/usr/lib/ruby/vendor_ruby/twitter-text/extractor.rb is in ruby-twitter-text 1.10.0+gem-1.
This file is owned by root:root, with mode 0o644.
The actual contents of the file can be viewed below.
# encoding: UTF-8
class String
  # Returns the number of characters (not bytes) in this string.
  #
  # When the string responds to +codepoints+, +length+ is used as is.
  # Otherwise the characters are counted through +chars+, because +length+
  # may be incorrect for unicode strings on such runtimes.
  def char_length
    if respond_to? :codepoints
      length
    else
      chars.kind_of?(Enumerable) ? chars.to_a.size : chars.size
    end
  end

  # Returns a new array holding the unicode characters of this string.
  #
  # The array is rebuilt on every call. Caching it in an instance variable
  # would return stale characters once the receiver is mutated, and would
  # raise on frozen strings, which cannot take new instance variables.
  def to_char_a
    if chars.kind_of?(Enumerable)
      chars.to_a
    else
      # +chars+ is not enumerable here: rebuild each character from the value
      # returned by slice(i), packed as a UTF-8 codepoint.
      char_array = []
      0.upto(char_length - 1) { |i| char_array << [chars.slice(i)].pack('U') }
      char_array
    end
  end
end
# Character-offset counterparts of MatchData#begin and MatchData#end, for
# callers that need character offsets instead of byte offsets.
class MatchData
  # Start offset of group +n+, counted in characters.
  def char_begin(n)
    offset = self.begin(n)
    return offset if string.respond_to?(:codepoints)

    # Otherwise count the characters in the prefix that ends at the offset.
    string[0, offset].char_length
  end

  # End offset of group +n+, counted in characters.
  def char_end(n)
    offset = self.end(n)
    return offset if string.respond_to?(:codepoints)

    # Otherwise count the characters in the prefix that ends at the offset.
    string[0, offset].char_length
  end
end
module Twitter
# A module for including Tweet parsing in a class. This module provides functions for the extraction and processing
# of usernames, lists, URLs, hashtags and cashtags.
module Extractor extend self
# Drops entities that overlap an earlier one.
#
# Entities are ordered by their start index; an entity is discarded when it
# starts before the end index of the most recently kept entity. The input
# array is left untouched and a new array is returned.
def remove_overlapping_entities(entities)
  last_kept = nil
  entities.sort_by { |candidate| candidate[:indices].first }.select do |candidate|
    overlaps = last_kept && last_kept[:indices].last > candidate[:indices].first
    # Only a kept entity becomes the reference for later comparisons.
    last_kept = candidate unless overlaps
    !overlaps
  end
end
# Extracts every URL, hashtag, username/list mention and cashtag in the Tweet
# <tt>text</tt>, each with the indices at which the entity occurred.
# Overlapping entities are removed before the result is returned. An empty
# array is returned when no entity is found.
#
# <tt>options</tt> is forwarded unchanged to extract_urls_with_indices. Note
# that the default here is an empty hash, so
# <tt>:extract_url_without_protocol</tt> is unset unless the caller passes it.
#
# If a block is given then it will be called for each entity.
def extract_entities_with_indices(text, options = {}, &block)
  urls     = extract_urls_with_indices(text, options)
  # URL overlap is resolved once for all entity kinds below.
  hashtags = extract_hashtags_with_indices(text, :check_url_overlap => false)
  mentions = extract_mentions_or_lists_with_indices(text)
  cashtags = extract_cashtags_with_indices(text)

  combined = urls + hashtags + mentions + cashtags
  return [] if combined.empty?

  result = remove_overlapping_entities(combined)
  result.each(&block) if block
  result
end
# Returns the screen name of every username mentioned in the Tweet
# <tt>text</tt>, as reported by extract_mentioned_screen_names_with_indices.
# An empty array is returned when that method reports no mentions.
#
# If a block is given then it will be called for each username.
def extract_mentioned_screen_names(text, &block) # :yields: username
  mentions = extract_mentioned_screen_names_with_indices(text)
  names = mentions.map { |mention| mention[:screen_name] }
  names.each(&block) if block
  names
end
# Extracts every username mentioned in the Tweet <tt>text</tt> along with the
# indices at which the mention occurred. List mentions (those with a
# non-empty list slug) are skipped. Returns an array of
# <tt>{:screen_name => ..., :indices => [start, end]}</tt> hashes, which is
# empty when <tt>text</tt> is nil or holds no username mentions.
#
# If a block is given, then it will be called with each username, the start
# index, and the end index in the <tt>text</tt>.
def extract_mentioned_screen_names_with_indices(text) # :yields: username, start, end
  return [] unless text

  mentions = []
  extract_mentions_or_lists_with_indices(text) do |name, slug, from, to|
    # A non-empty slug marks a list mention, not a plain username.
    mentions << { :screen_name => name, :indices => [from, to] } if slug.empty?
  end

  if block_given?
    mentions.each do |entry|
      from = entry[:indices].first
      to = entry[:indices].last
      yield entry[:screen_name], from, to
    end
  end

  mentions
end
# Extracts every username or list mentioned in the Tweet <tt>text</tt> along
# with the indices at which the mention occurred. Returns an array of
# <tt>{:screen_name => ..., :list_slug => ..., :indices => [start, end]}</tt>
# hashes; the array is empty when <tt>text</tt> is nil or holds no username
# or list mentions.
#
# If a block is given, then it will be called with each username, list slug, the start
# index, and the end index in the <tt>text</tt>. The list_slug will be an empty string
# if this is a username mention.
def extract_mentions_or_lists_with_indices(text) # :yields: username, list_slug, start, end
  # Cheap pre-check before running the full pattern. Both the ASCII at sign
  # and the fullwidth commercial at (U+FF20) qualify; the latter is written as
  # an escape so it survives re-encoding of this file.
  # NOTE(review): Twitter::Regex is defined outside this file; presumably its
  # mention pattern accepts U+FF20 as well - confirm.
  return [] unless text =~ /[@\uFF20]/

  possible_entries = []
  text.to_s.scan(Twitter::Regex[:valid_mention_or_list]) do |before, at, screen_name, list_slug|
    # Capture the match state first: the =~ below overwrites $~ and $'.
    match_data = $~
    after = $'
    unless after =~ Twitter::Regex[:end_mention_match]
      # Group 3 is the screen name; the at sign sits one character before it.
      start_position = match_data.char_begin(3) - 1
      # The entity ends after the list slug (group 4) when one was matched.
      end_position = match_data.char_end(list_slug.nil? ? 3 : 4)
      possible_entries << {
        :screen_name => screen_name,
        :list_slug => list_slug || "",
        :indices => [start_position, end_position]
      }
    end
  end

  if block_given?
    possible_entries.each do |mention|
      yield mention[:screen_name], mention[:list_slug], mention[:indices].first, mention[:indices].last
    end
  end

  possible_entries
end
# Extracts the username being replied to in the Tweet <tt>text</tt>. Returns
# nil when <tt>text</tt> is <tt>nil</tt>, when it does not match the reply
# pattern, or when the text right after the match matches the end-of-mention
# pattern.
#
# If a block is given then it will be called with the username replied to (if any)
def extract_reply_screen_name(text) # :yields: username
  return nil unless text

  reply = text.match(Twitter::Regex[:valid_reply])
  return if reply.nil?
  # Reject the match when what follows the username disqualifies it.
  return if reply.post_match =~ Twitter::Regex[:end_mention_match]

  name = reply.captures.first
  yield name if block_given?
  name
end
# Returns the text of every URL in the Tweet <tt>text</tt>, as reported by
# extract_urls_with_indices called with its default options. An empty array
# is returned when <tt>text</tt> is <tt>nil</tt> or contains no URLs.
#
# If a block is given then it will be called for each URL.
def extract_urls(text, &block) # :yields: url
  extract_urls_with_indices(text).map { |entity| entity[:url] }.tap do |found|
    found.each(&block) unless block.nil?
  end
end
# Extracts every URL in the Tweet <tt>text</tt> along with its indices.
# Returns an array of <tt>{:url => ..., :indices => [start, end]}</tt> hashes;
# the array is empty when <tt>text</tt> is <tt>nil</tt> or contains no URLs.
#
# <tt>options[:extract_url_without_protocol]</tt> controls whether URLs
# written without a protocol are considered. The default hash enables it, but
# a caller-supplied hash that omits the key disables it, because the value is
# read with a plain hash lookup.
#
# If a block is given then it will be called with the URL, start index and
# end index of each extracted URL.
def extract_urls_with_indices(text, options = {:extract_url_without_protocol => true}) # :yields: url, start, end
# Fast exit: nothing to scan unless the text holds a "." (protocol-less URLs
# allowed) or a ":" (protocol required).
return [] unless text && (options[:extract_url_without_protocol] ? text.index(".") : text.index(":"))
urls = []
text.to_s.scan(Twitter::Regex[:valid_url]) do |all, before, url, protocol, domain, port, path, query|
# Keep the outer match: the nested scan and the =~ calls below overwrite $~.
valid_url_match_data = $~
# Group 3 corresponds to the +url+ block parameter.
start_position = valid_url_match_data.char_begin(3)
end_position = valid_url_match_data.char_end(3)
# If protocol is missing and domain contains non-ASCII characters,
# extract ASCII-only domains.
if !protocol
next if !options[:extract_url_without_protocol] || before =~ Twitter::Regex[:invalid_url_without_protocol_preceding_chars]
last_url = nil
domain.scan(Twitter::Regex[:valid_ascii_domain]) do |ascii_domain|
# $~ is now the nested match, so its offsets are relative to +domain+;
# they are shifted by start_position to index into the text.
last_url = {
:url => ascii_domain,
:indices => [start_position + $~.char_begin(0),
start_position + $~.char_end(0)]
}
# Keep the domain when a path follows it, when it matches the special short
# domain pattern, or when it does not match the invalid short domain pattern.
if path ||
ascii_domain =~ Twitter::Regex[:valid_special_short_domain] ||
ascii_domain !~ Twitter::Regex[:invalid_short_domain]
urls << last_url
end
end
# no ASCII-only domain found. Skip the entire URL
next unless last_url
# last_url only contains domain. Need to add path and query if they exist.
if path
# last_url is already in urls here (a path always satisfies the condition
# above). The same hash is updated in place: the domain part of +url+ is
# swapped for the ASCII domain and the end index is moved to the end of the
# whole match.
last_url[:url] = url.sub(domain, last_url[:url])
last_url[:indices][1] = end_position
end
else
# In the case of t.co URLs, don't allow additional path characters
if url =~ Twitter::Regex[:valid_tco_url]
# $& is the part of +url+ matched by the t.co pattern; the end index is
# recomputed from its length.
url = $&
end_position = start_position + url.char_length
end
urls << {
:url => url,
:indices => [start_position, end_position]
}
end
end
urls.each{|url| yield url[:url], url[:indices].first, url[:indices].last} if block_given?
urls
end
# Returns the text of every hashtag in the Tweet <tt>text</tt>, as reported
# by extract_hashtags_with_indices called with its default options. The
# strings are the <tt>:hashtag</tt> values of those entities. An empty array
# is returned when no hashtag is reported.
#
# If a block is given then it will be called for each hashtag.
def extract_hashtags(text, &block) # :yields: hashtag_text
  entities = extract_hashtags_with_indices(text)
  names = entities.map { |entity| entity[:hashtag] }
  names.each(&block) if block
  names
end
# Extracts every hashtag in the Tweet <tt>text</tt> along with the indices at
# which it occurred. Returns an array of
# <tt>{:hashtag => ..., :indices => [start, end]}</tt> hashes; the
# <tt>:hashtag</tt> value is the third capture of the hashtag pattern, while
# the start index points at the second capture (the hash sign). The array is
# empty when <tt>text</tt> is <tt>nil</tt> or contains no hashtags.
#
# When <tt>options[:check_url_overlap]</tt> is truthy (the default), hashtags
# that overlap an extracted URL are removed from the result.
#
# If a block is given then it will be called with the hashtag text, start
# index and end index of each hashtag.
def extract_hashtags_with_indices(text, options = {:check_url_overlap => true}) # :yields: hashtag_text, start, end
  # Cheap pre-check before running the full pattern. Both the ASCII number
  # sign and the fullwidth number sign (U+FF03) qualify; the latter is written
  # as an escape so it survives re-encoding of this file.
  # NOTE(review): Twitter::Regex is defined outside this file; presumably its
  # hashtag pattern accepts U+FF03 as well - confirm.
  return [] unless text =~ /[#\uFF03]/

  tags = []
  # to_s keeps this in line with the mention and URL extractors.
  text.to_s.scan(Twitter::Regex[:valid_hashtag]) do |before, hash, hash_text|
    # Read the match state before the =~ below overwrites $~ and $'.
    match_data = $~
    start_position = match_data.char_begin(2)
    end_position = match_data.char_end(3)
    after = $'
    unless after =~ Twitter::Regex[:end_hashtag_match]
      tags << {
        :hashtag => hash_text,
        :indices => [start_position, end_position]
      }
    end
  end

  if options[:check_url_overlap]
    # extract URLs
    urls = extract_urls_with_indices(text)
    unless urls.empty?
      # Mix the URLs in so that overlap removal can drop hashtags that overlap
      # an earlier-starting URL.
      tags.concat(urls)
      # remove duplicates
      tags = remove_overlapping_entities(tags)
      # remove URL entities
      tags.reject! { |entity| !entity[:hashtag] }
    end
  end

  tags.each { |tag| yield tag[:hashtag], tag[:indices].first, tag[:indices].last } if block_given?
  tags
end
# Returns the text of every cashtag in the Tweet <tt>text</tt>, as reported
# by extract_cashtags_with_indices. The strings are the <tt>:cashtag</tt>
# values of those entities. An empty array is returned when no cashtag is
# reported.
#
# If a block is given then it will be called for each cashtag.
def extract_cashtags(text, &block) # :yields: cashtag_text
  extract_cashtags_with_indices(text).map { |entity| entity[:cashtag] }.tap do |symbols|
    symbols.each(&block) unless block.nil?
  end
end
# Extracts every cashtag in the Tweet <tt>text</tt> along with the indices at
# which it occurred. Returns an array of
# <tt>{:cashtag => ..., :indices => [start, end]}</tt> hashes; the
# <tt>:cashtag</tt> value is the third capture of the cashtag pattern, while
# the start index points at the second capture (the dollar sign). The array
# is empty when <tt>text</tt> is <tt>nil</tt> or contains no <tt>$</tt>.
#
# If a block is given then it will be called with the cashtag text, start
# index and end index of each cashtag.
def extract_cashtags_with_indices(text) # :yields: cashtag_text, start, end
  # Cheap pre-check: skip the full pattern when there is no dollar sign.
  return [] unless text =~ /\$/

  cashtags = []
  text.scan(Twitter::Regex[:valid_cashtag]) do |_before, _dollar, symbol|
    match = Regexp.last_match
    cashtags << {
      :cashtag => symbol,
      :indices => [match.char_begin(2), match.char_end(3)]
    }
  end

  if block_given?
    cashtags.each do |entry|
      yield entry[:cashtag], entry[:indices].first, entry[:indices].last
    end
  end

  cashtags
end
end
end
|