# Record describing one stored link. Instances are built below as
# Url.new(channel, nick, time, url, info) and read back through the
# accessors of the same names.
# NOTE(review): define_structure is not defined in this file; presumably it
# creates a Struct-like class named Url -- confirm.
6 define_structure :Url, :channel, :nick, :time, :url, :info
# Top-level error class raised by get_title_for_url with the message
# "getting link (<code> - <message>)" built from the HTTP response.
8 class ::UrlLinkError < RuntimeError
# Plugin that stores urls per target in @registry, can reply with link
# information (title, content type, size) for them, and answers the
# "urls" listing and search commands mapped at the end of the file.
11 class UrlPlugin < Plugin
# Captures the text between <title> and </title>. The match is
# case-insensitive and "." also matches newlines, so a title spread over
# several lines is still found.
TITLE_RE = /<\s*?title\s*?>(.+?)<\s*?\/title\s*?>/im
# Prefix put in front of link information replies.
LINK_INFO = "[Link Info]"
# Matches every character that is neither URI-unreserved nor URI-reserved nor
# one of "%", "#" or space; it is handed to URI.escape as the set of
# characters to escape.
# Regexp::NOENCODING replaces the former third 'N' argument of Regexp.new:
# the three-argument form is deprecated in Ruby 3.2 and raises ArgumentError
# from Ruby 3.3 on, while the option flag selects the same behaviour.
OUR_UNSAFE = Regexp.new("[^#{URI::PATTERN::UNRESERVED}#{URI::PATTERN::RESERVED}%# ]", Regexp::NOENCODING)
# url.max_urls: integer, default 100, validated to be greater than zero.
16 Config.register Config::IntegerValue.new('url.max_urls',
17 :default => 100, :validate => Proc.new{|v| v > 0},
18 :desc => "Maximum number of urls to store. New urls replace oldest ones.")
# url.display_link_info: integer; it is the default value of handle_urls'
# display_info parameter (how many links per line get info, 0 disables).
19 Config.register Config::IntegerValue.new('url.display_link_info',
21 :desc => "Get the title of links pasted to the channel and display it (also tells if the link is broken or the site is down). Do it for at most this many links per line (set to 0 to disable)")
# url.titles_only: boolean; when set, get_title_for_url does not fall back to
# the "type: ..." description for pages without a title.
22 Config.register Config::BooleanValue.new('url.titles_only',
24 :desc => "Only show info for links that have <title> tags (in other words, don't display info for jpegs, mpegs, etc.)")
# url.first_par: boolean; when set, get_title_for_url also extracts the first
# paragraph of the page.
25 Config.register Config::BooleanValue.new('url.first_par',
27 :desc => "Also try to get the first paragraph of a web page")
# url.info_on_list: boolean; consulted by reply_urls and search.
28 Config.register Config::BooleanValue.new('url.info_on_list',
30 :desc => "Show link info when listing/searching for urls")
# url.no_info_hosts: array of regexp sources. The defaults cover localhost and
# the 192.168., 10., 127. and 172.16-172.31 address prefixes. Changing the
# value calls reset_no_info_hosts on the plugin registered as 'url', which
# rebuilds the combined regexp.
31 Config.register Config::ArrayValue.new('url.no_info_hosts',
32 :default => ['localhost', '^192\.168\.', '^10\.', '^127\.', '^172\.(1[6-9]|2\d|31)\.'],
33 :on_change => Proc.new { |bot, v| bot.plugins['url'].reset_no_info_hosts },
34 :desc => "A list of regular expressions matching hosts for which no info should be provided")
# Use a new Array as the registry default.
# NOTE(review): presumably this makes targets without stored urls read back
# as an empty list -- confirm against the registry's set_default.
39 @registry.set_default(Array.new)
# When the configured url.display_link_info value is not an Integer, feed its
# string form back through the config item's set_string.
# NOTE(review): presumably this converts a value saved by an older, non-integer
# version of the option -- confirm.
40 unless @bot.config['url.display_link_info'].kind_of?(Integer)
41 @bot.config.items[:'url.display_link_info'].set_string(@bot.config['url.display_link_info'].to_s)
# Rebuild @no_info_hosts, the case-insensitive regexp that get_title_for_url
# matches against a url's host to decide whether info retrieval is disabled.
#
# The regexp is the alternation of all entries of the 'url.no_info_hosts'
# config value. Empty entries are dropped, and an empty list yields a regexp
# that never matches: joining nothing (or an empty alternative) would give a
# pattern that matches every host and silently disable info for all urls.
def reset_no_info_hosts
  patterns = @bot.config['url.no_info_hosts'].reject { |pat| pat.to_s.empty? }
  @no_info_hosts =
    if patterns.empty?
      /(?!)/ # empty negative lookahead: matches no string at all
    else
      Regexp.new(patterns.join('|'), Regexp::IGNORECASE)
    end
  debug "no info hosts regexp set to #{@no_info_hosts}"
end
# Return the usage text for the "urls" commands. The same string is returned
# whatever +plugin+ and +topic+ are; neither argument is used in the text.
51 def help(plugin, topic="")
52 "urls [<max>=4] => list <max> last urls mentioned in current channel, urls search [<max>=4] <regexp> => search for matching urls. In a private message, you must specify the channel to query, eg. urls <channel> [max], urls search <channel> [max] <regexp>"
# Look for a <title> element in +pagedata+ (a chunk of page source) using
# TITLE_RE.
# Returns nil when TITLE_RE does not match.
55 def get_title_from_html(pagedata)
# Guard: no <title>...</title> pair in the data means there is no title.
56 return unless TITLE_RE.match(pagedata)
# Fetch a url and build a short, IRC-formatted description of it.
#
# uri_str:: a URI object, or a string that is handed to URI.parse
# nick, channel, ircline:: optional context; each one that is given is copied
#                          into the options passed along with the :url_added
#                          event
#
# Returns nil when the scheme does not match /https?/, a "Sorry, ... disabled"
# notice when the host matches @no_info_hosts, a "title: ..." string when a
# title is found, or a "type: ..." string built from the response headers.
# Visible raises: UrlLinkError carrying the response code and message, and a
# RuntimeError "connecting to site/processing information (...)" from the
# rescue clause.
60 def get_title_for_url(uri_str, nick = nil, channel = nil, ircline = nil)
# Accept either an already-parsed URI or its string form.
62 url = uri_str.kind_of?(URI) ? uri_str : URI.parse(uri_str)
63 return if url.scheme !~ /https?/
# Hosts matching the regexp built by reset_no_info_hosts are never fetched.
65 if url.host =~ @no_info_hosts
66 return "Sorry, info retrieval for #{url.host} is disabled"
# Only the context values that were actually supplied are recorded.
70 logopts[:nick] = nick if nick
71 logopts[:channel] = channel if channel
72 logopts[:ircline] = ircline if ircline
78 debug "+ getting #{url.request_uri}"
79 @bot.httputil.get_response(url) { |resp|
# Content types starting with "text/" or containing "xml" or "html".
85 if resp['content-type'] =~ /^text\/|(?:x|ht)ml/
86 # The page is text or HTML, so we can try finding a title and, if
87 # requested, the first par.
89 # We act differently depending on whether we want the first par or
90 # not: in the first case we download the initial part and then parse
91 # it; in the second case we only download as much as we need to find
94 if @bot.config['url.first_par']
# Download at most http.info_bytes of the body and search it for a title.
95 partial = resp.partial_body(@bot.config['http.info_bytes'])
96 logopts[:title] = title = get_title_from_html(partial)
# For urls with a fragment, drop everything up to and including the
# <a name="fragment"> tag so the paragraph search starts at that anchor.
97 if url.fragment and not url.fragment.empty?
98 fragreg = /.*?<a\s+[^>]*name=["']?#{url.fragment}["']?.*?>/im
99 partial.sub!(fragreg,'')
# The title is passed as :strip.
# NOTE(review): presumably so the paragraph does not repeat the title -- confirm
# against Utils.ircify_first_html_par.
101 first_par = Utils.ircify_first_html_par(partial, :strip => title)
102 unless first_par.empty?
103 logopts[:extra] = first_par
104 extra << ", #{Bold}text#{Bold}: #{first_par}"
106 call_event(:url_added, url.to_s, logopts)
107 return "#{Bold}title#{Bold}: #{title}#{extra}" if title
# Block form of partial_body: each yielded part is searched for a title and
# the method returns as soon as one is found.
109 resp.partial_body(@bot.config['http.info_bytes']) { |part|
110 logopts[:title] = title = get_title_from_html(part)
111 call_event(:url_added, url.to_s, logopts)
112 return "#{Bold}title#{Bold}: #{title}" if title
115 # if nothing was found, provide more basic info, as for non-html pages
120 enc = resp['content-encoding']
# logopts[:extra] collects the plain-text form of the header details, while
# +extra+ collects the Bold-marked form used in the returned string.
121 logopts[:extra] = String.new
122 logopts[:extra] << "Content Type: #{resp['content-type']}"
124 logopts[:extra] << ", encoding: #{enc}"
125 extra << ", #{Bold}encoding#{Bold}: #{enc}"
128 unless @bot.config['url.titles_only']
129 # content doesn't have title, just display info.
# Format content-length with thousands separators; the trailing rescue turns
# a failure (for example a missing header, where gsub is called on nil) into
# nil.
130 size = resp['content-length'].gsub(/(\d)(?=\d{3}+(?:\.|$))(\d{3}\..*)?/,'\1,\2') rescue nil
132 logopts[:extra] << ", size: #{size} bytes"
# +size+ is reused to hold the ready-to-append reply fragment.
133 size = ", #{Bold}size#{Bold}: #{size} bytes"
135 call_event(:url_added, url.to_s, logopts)
136 return "#{Bold}type#{Bold}: #{resp['content-type']}#{size}#{extra}"
138 call_event(:url_added, url.to_s, logopts)
140 raise UrlLinkError, "getting link (#{resp.code} - #{resp.message})"
# NOTE(review): rescuing Exception also catches non-StandardError exceptions;
# confirm this breadth is intended.
144 rescue Exception => e
150 raise "connecting to site/processing information (#{e.message})"
# Process the urls found in message +m+: optionally reply with link info and
# store urls that are not yet listed in the registry entry for m.target.
#
# m:: the message the urls came from; replies are sent through it
# urls:: collection of url strings; nothing happens when it is empty
# display_info:: number compared against urls_displayed to decide whether link
#                info is looked up; defaults to the url.display_link_info
#                config value
155 def handle_urls(m, urls, display_info=@bot.config['url.display_link_info'])
156 return if urls.empty?
157 debug "found urls #{urls.inspect}"
# Stored urls are kept per message target.
159 list = @registry[m.target]
165 debug "working on #{urlstr}"
# Only urls starting with http: or https: are handled.
166 next unless urlstr =~ /^https?:/
168 debug "display link info: #{display_info}"
# NOTE(review): urls_displayed presumably counts the links already answered
# for this message -- confirm.
169 if display_info > urls_displayed
172 debug "Getting title for #{urlstr}..."
174 title = get_title_for_url urlstr, m.source.nick, m.channel, m.message
176 m.reply "#{LINK_INFO} #{title}", :overlong => :truncate
179 debug "Title not found!"
182 m.reply "Error #{e.message}"
189 # check to see if this url is already listed
190 next if list.find {|u| u.url == urlstr }
192 url = Url.new(m.target, m.sourcenick, Time.new, urlstr, title)
193 debug "#{list.length} urls so far"
# NOTE(review): the comparison is strict (>), so depending on how the list is
# trimmed it may end up holding url.max_urls + 1 entries -- confirm.
194 if list.length > @bot.config['url.max_urls']
197 debug "storing url #{url.url}"
199 debug "#{list.length} urls now"
# Write the list back to the registry.
201 @registry[m.target] = list
# Escape the characters matched by OUR_UNSAFE in the text form of
# params[:urls], then extract the urls from the escaped text.
205 escaped = URI.escape(params[:urls].to_s, OUR_UNSAFE)
206 urls = URI.extract(escaped)
# params[:urls].length is passed as display_info instead of the configured
# default.
# NOTE(review): presumably so that every explicitly requested url gets its
# info displayed -- confirm.
207 handle_urls(m, urls, params[:urls].length)
# Only PrivMessage instances are scanned for urls.
211 return unless m.kind_of?(PrivMessage)
# Escape the characters matched by OUR_UNSAFE in the message text, then
# extract the urls from the escaped text.
214 escaped = URI.escape(m.message, OUR_UNSAFE)
215 urls = URI.extract(escaped)
# Reply with one line per url: "[timestamp] <nick> url", optionally followed
# by " --> info".
#
# opts:: option hash; callers in this file pass :msg, :channel, :list and
#        :max. Nothing is sent unless list, max and m are all set.
219 def reply_urls(opts={})
222 channel = opts[:channel]
224 return unless list and max and m
# NOTE(review): with max == 0 the range becomes 0..-1, which selects the whole
# list rather than nothing -- confirm this is intended.
225 list[0..(max-1)].each do |url|
226 disp = "[#{url.time.strftime('%Y/%m/%d %H:%M:%S')}] <#{url.nick}> #{url.url}"
227 if @bot.config['url.info_on_list']
# Prefer the stored info and fetch it only when missing; the trailing rescue
# turns any fetch error into nil so the listing still goes out.
228 title = url.info || get_title_for_url(url.url, url.nick, channel) rescue nil
229 # If the url info was missing and we now have some, try to upgrade it
230 if channel and title and not url.info
231 ll = @registry[channel]
233 if el = ll.find { |u| u.url == url.url }
235 @registry[channel] = ll
238 disp << " --> #{title}" if title
240 m.reply disp, :overlong => :truncate
# An explicitly given channel wins; otherwise the target of the message is
# used.
245 channel = params[:channel] ? params[:channel] : m.target
246 max = params[:limit].to_i
249 list = @registry[channel]
251 m.reply "no urls seen yet for channel #{channel}"
# Formatting of the entries is delegated to reply_urls.
253 reply_urls :msg => m, :channel => channel, :list => list, :max => max
# Search the urls stored for a channel and reply with the matching entries.
#
# m:: the message that triggered the search; replies are sent through it
# params:: reads :channel (optional, falls back to m.target), :limit and
#          :string (the regexp source)
257 def search(m, params)
258 channel = params[:channel] ? params[:channel] : m.target
259 max = params[:limit].to_i
260 string = params[:string]
# The search string is compiled as a case-insensitive regexp.
263 regex = Regexp.new(string, Regexp::IGNORECASE)
# An entry matches on its url or nick, and also on its stored info when
# url.info_on_list is enabled.
264 list = @registry[channel].find_all {|url|
265 regex.match(url.url) || regex.match(url.nick) ||
266 (@bot.config['url.info_on_list'] && regex.match(url.info))
269 m.reply "no matches for channel #{channel}"
271 reply_urls :msg => m, :channel => channel, :list => list, :max => max
# Instantiate the plugin and declare its command routes.
276 plugin = UrlPlugin.new
# "urls info <urls...>" is handled by the 'info' action.
277 plugin.map 'urls info *urls', :action => 'info'
# "urls search <channel> <limit> <string>" is handled by the 'search' action;
# the limit defaults to 4 and must consist of digits only.
278 plugin.map 'urls search :channel :limit :string', :action => 'search',
279 :defaults => {:limit => 4},
280 :requirements => {:limit => /^\d+$/},
# Same search route without an explicit channel.
282 plugin.map 'urls search :limit :string', :action => 'search',
283 :defaults => {:limit => 4},
284 :requirements => {:limit => /^\d+$/},
# "urls <channel> <limit>": no :action is given on the visible lines; the
# limit defaults to 4 and must consist of digits only.
286 plugin.map 'urls :channel :limit', :defaults => {:limit => 4},
287 :requirements => {:limit => /^\d+$/},
# Same listing route without an explicit channel.
289 plugin.map 'urls :limit', :defaults => {:limit => 4},
290 :requirements => {:limit => /^\d+$/},