# Declares the Url record type with the fields channel, nick, time, url and
# info. Elsewhere in this file it is built positionally as
# Url.new(target, nick, time, url, title) and read through url.time, url.nick,
# url.url and url.info.
# NOTE(review): define_structure is defined outside this file -- presumably it
# creates a Struct-like class; confirm.
1 define_structure :Url, :channel, :nick, :time, :url, :info
# Top-level error class (a RuntimeError). In this file it is raised with the
# message "getting link (<code> - <message>)" built from an HTTP response.
# NOTE(review): the class body and its closing `end` are not visible in this
# excerpt (the embedded line numbers jump from 3 to 6).
3 class ::UrlLinkError < RuntimeError
# Plugin that stores the URLs found in messages and can report link info
# (page title or content type) for them; the options registered below control
# how many urls are kept and when link info is shown.
6 class UrlPlugin < Plugin
# Captures the text between <title> and </title>. /i ignores case and /m lets
# "." match newlines, so a title spread over several lines is still found.
7 TITLE_RE = /<\s*?title\s*?>(.+?)<\s*?\/title\s*?>/im
# Prefix put in front of link-info replies.
8 LINK_INFO = "[Link Info]"
# Matches every character that is not a URI unreserved or reserved character,
# "%", "#" or a space. It is passed to URI.escape as the set of characters to
# escape before URI.extract is run on a message.
9 OUR_UNSAFE = Regexp.new("[^#{URI::PATTERN::UNRESERVED}#{URI::PATTERN::RESERVED}%# ]", false, 'N')
# Configuration options.
# NOTE(review): the embedded line numbers skip 15, 18, 21 and 24, so one line
# of each of the last four registrations (presumably its :default) is not
# visible in this excerpt.
11 BotConfig.register BotConfigIntegerValue.new('url.max_urls',
12 :default => 100, :validate => Proc.new{|v| v > 0},
13 :desc => "Maximum number of urls to store. New urls replace oldest ones.")
14 BotConfig.register BotConfigIntegerValue.new('url.display_link_info',
16 :desc => "Get the title of links pasted to the channel and display it (also tells if the link is broken or the site is down). Do it for at most this many links per line (set to 0 to disable)")
17 BotConfig.register BotConfigBooleanValue.new('url.titles_only',
19 :desc => "Only show info for links that have <title> tags (in other words, don't display info for jpegs, mpegs, etc.)")
20 BotConfig.register BotConfigBooleanValue.new('url.first_par',
22 :desc => "Also try to get the first paragraph of a web page")
23 BotConfig.register BotConfigBooleanValue.new('url.info_on_list',
25 :desc => "Show link info when listing/searching for urls")
# NOTE(review): the `def` line enclosing these statements is not visible in
# this excerpt -- presumably this is the plugin's initialize; confirm.
# Registry entries that were never set default to an empty Array.
30 @registry.set_default(Array.new)
# If the stored url.display_link_info value is not an Integer, feed its string
# form back through set_string on the config item.
# NOTE(review): presumably this converts a value saved by an older,
# non-integer version of the option -- confirm.
31 unless @bot.config['url.display_link_info'].kind_of?(Integer)
32 @bot.config.items[:'url.display_link_info'].set_string(@bot.config['url.display_link_info'].to_s)
# Returns the usage text for the "urls" and "urls search" commands. Neither
# argument is used on the visible line.
36 def help(plugin, topic="")
37 "urls [<max>=4] => list <max> last urls mentioned in current channel, urls search [<max>=4] <regexp> => search for matching urls. In a private message, you must specify the channel to query, eg. urls <channel> [max], urls search <channel> [max] <regexp>"
# Looks for a <title> element in +pagedata+ using TITLE_RE and returns nil
# when there is none.
# NOTE(review): the line that produces the return value for a match is not
# visible in this excerpt (the embedded line numbers jump from 41 to 45).
40 def get_title_from_html(pagedata)
41 return unless TITLE_RE.match(pagedata)
45 def get_title_for_url(uri_str, nick = nil, channel = nil, ircline = nil)
47 url = uri_str.kind_of?(URI) ? uri_str : URI.parse(uri_str)
48 return if url.scheme !~ /https?/
51 logopts[:nick] = nick if nick
52 logopts[:channel] = channel if channel
53 logopts[:ircline] = ircline if ircline
59 debug "+ getting #{url.request_uri}"
60 @bot.httputil.get_response(url) { |resp|
66 if resp['content-type'] =~ /^text\/|(?:x|ht)ml/
67 # The page is text or HTML, so we can try finding a title and, if
68 # requested, the first par.
70 # We act differently depending on whether we want the first par or
71 # not: in the first case we download the initial part and the parse
72 # it; in the second case we only download as much as we need to find
75 if @bot.config['url.first_par']
76 partial = resp.partial_body(@bot.config['http.info_bytes'])
77 logopts[:title] = title = get_title_from_html(partial)
78 if url.fragment and not url.fragment.empty?
79 fragreg = /.*?<a\s+[^>]*name=["']?#{url.fragment}["']?.*?>/im
80 partial.sub!(fragreg,'')
82 first_par = Utils.ircify_first_html_par(partial, :strip => title)
83 unless first_par.empty?
84 logopts[:extra] = first_par
85 extra << ", #{Bold}text#{Bold}: #{first_par}"
87 call_event(:url_added, url.to_s, logopts)
88 return "#{Bold}title#{Bold}: #{title}#{extra}" if title
90 resp.partial_body(@bot.config['http.info_bytes']) { |part|
91 logopts[:title] = title = get_title_from_html(part)
92 call_event(:url_added, url.to_s, logopts)
93 return "#{Bold}title#{Bold}: #{title}" if title
96 # if nothing was found, provide more basic info, as for non-html pages
101 enc = resp['content-encoding']
102 logopts[:extra] = String.new
103 logopts[:extra] << "Content Type: #{resp['content-type']}"
105 logopts[:extra] << ", encoding: #{enc}"
106 extra << ", #{Bold}encoding#{Bold}: #{enc}"
109 unless @bot.config['url.titles_only']
110 # content doesn't have title, just display info.
111 size = resp['content-length'].gsub(/(\d)(?=\d{3}+(?:\.|$))(\d{3}\..*)?/,'\1,\2') rescue nil
113 logopts[:extra] << ", size: #{size} bytes"
114 size = ", #{Bold}size#{Bold}: #{size} bytes"
116 call_event(:url_added, url.to_s, logopts)
117 return "#{Bold}type#{Bold}: #{resp['content-type']}#{size}#{extra}"
119 call_event(:url_added, url.to_s, logopts)
121 raise UrlLinkError, "getting link (#{resp.code} - #{resp.message})"
125 rescue Exception => e
131 raise "connecting to site/processing information (#{e.message})"
# NOTE(review): the `def` line enclosing these statements is not visible in
# this excerpt; `m` is the incoming message -- presumably this is the plugin's
# message hook. Confirm against the full file.
# Only PrivMessage instances are processed.
137 return unless m.kind_of?(PrivMessage)
# Escape the characters matched by OUR_UNSAFE, then pull every URI out of the
# escaped message text.
140 escaped = URI.escape(m.message, OUR_UNSAFE)
141 urls = URI.extract(escaped)
142 return if urls.empty?
143 debug "found urls #{urls.inspect}"
# Urls already stored under this message's target.
144 list = @registry[m.target]
147 debug "working on #{urlstr}"
# Skip anything that is not an http or https link.
148 next unless urlstr =~ /^https?:/
150 debug "display link info: #{@bot.config['url.display_link_info']}"
# url.display_link_info is compared with urls_displayed before fetching info.
# NOTE(review): urls_displayed is set and updated on lines not visible here.
151 if @bot.config['url.display_link_info'] > urls_displayed
154 debug "Getting title for #{urlstr}..."
156 title = get_title_for_url urlstr, m.source.nick, m.channel, m.message
158 m.reply "#{LINK_INFO} #{title}", :overlong => :truncate
161 debug "Title not found!"
164 m.reply "Error #{e.message}"
169 # check to see if this url is already listed
170 next if list.find {|u| u.url == urlstr }
# Record target, sender nick, current time, the url and the fetched title
# (stored in the info field).
172 url = Url.new(m.target, m.sourcenick, Time.new, urlstr, title)
173 debug "#{list.length} urls so far"
# NOTE(review): if a single entry is dropped here before the new one is
# added, `>` lets the list reach url.max_urls + 1 entries and `>=` would be
# the intended test. The trimming code is not visible in this excerpt --
# confirm.
174 if list.length > @bot.config['url.max_urls']
177 debug "storing url #{url.url}"
179 debug "#{list.length} urls now"
# Write the list back to the registry under the same target.
181 @registry[m.target] = list
# Sends one reply line per stored url, for at most +max+ urls.
# Callers in this file pass the options :msg, :channel, :list and :max.
# NOTE(review): the lines that assign list, max and m are not visible in this
# excerpt -- presumably they are read from opts[:list], opts[:max] and
# opts[:msg]; confirm.
184 def reply_urls(opts={})
187 channel = opts[:channel]
# Nothing is sent unless list, max and m are all set.
189 return unless list and max and m
# Only the first max entries of the list are shown.
190 list[0..(max-1)].each do |url|
191 disp = "[#{url.time.strftime('%Y/%m/%d %H:%M:%S')}] <#{url.nick}> #{url.url}"
192 if @bot.config['url.info_on_list']
# Use the stored info when there is one, otherwise fetch it now; any error
# during the fetch leaves title nil.
193 title = url.info || get_title_for_url(url.url, url.nick, channel) rescue nil
194 # If the url info was missing and we now have some, try to upgrade it
195 if channel and title and not url.info
196 ll = @registry[channel]
198 if el = ll.find { |u| u.url == url.url }
# The channel's list is written back to the registry.
# NOTE(review): the line that updates the found entry is not visible here.
200 @registry[channel] = ll
203 disp << " --> #{title}" if title
205 m.reply disp, :overlong => :truncate
# NOTE(review): the `def` line enclosing these statements is not visible in
# this excerpt -- presumably this is the handler for the "urls" command maps;
# confirm.
# The channel comes from the command parameters, falling back to the
# message's target.
210 channel = params[:channel] ? params[:channel] : m.target
211 max = params[:limit].to_i
# NOTE(review): embedded lines 212-213 are not visible; any clamping of max
# would happen there.
214 list = @registry[channel]
# NOTE(review): the condition choosing between the two replies below is not
# visible -- presumably an empty-list check; confirm.
216 m.reply "no urls seen yet for channel #{channel}"
218 reply_urls :msg => m, :channel => channel, :list => list, :max => max
# Handler for the "urls search" command maps: lists the stored urls of a
# channel that match a regular expression.
# params[:channel] (optional, defaults to m.target), params[:limit] and
# params[:string] are read from the command.
222 def search(m, params)
223 channel = params[:channel] ? params[:channel] : m.target
224 max = params[:limit].to_i
225 string = params[:string]
# The search string is compiled as a case-insensitive regular expression.
# NOTE(review): Regexp.new raises RegexpError on an invalid pattern and no
# rescue is visible in this excerpt -- confirm how that is handled.
228 regex = Regexp.new(string, Regexp::IGNORECASE)
# Keep the entries whose url or nick matches; the info field is only searched
# when url.info_on_list is enabled.
229 list = @registry[channel].find_all {|url|
230 regex.match(url.url) || regex.match(url.nick) ||
231 (@bot.config['url.info_on_list'] && regex.match(url.info))
# NOTE(review): the condition choosing between the two replies below is not
# visible -- presumably an empty-list check; confirm.
234 m.reply "no matches for channel #{channel}"
236 reply_urls :msg => m, :channel => channel, :list => list, :max => max
# Instantiate the plugin and register its command templates. In every map
# :limit defaults to 4 and must consist of digits only. The two "urls search"
# maps dispatch to the 'search' action.
# NOTE(review): each map ends with a trailing comma and the embedded line
# numbers skip 245, 249, 252 and 255, so the final option of every map is not
# visible in this excerpt.
241 plugin = UrlPlugin.new
242 plugin.map 'urls search :channel :limit :string', :action => 'search',
243 :defaults => {:limit => 4},
244 :requirements => {:limit => /^\d+$/},
246 plugin.map 'urls search :limit :string', :action => 'search',
247 :defaults => {:limit => 4},
248 :requirements => {:limit => /^\d+$/},
250 plugin.map 'urls :channel :limit', :defaults => {:limit => 4},
251 :requirements => {:limit => /^\d+$/},
253 plugin.map 'urls :limit', :defaults => {:limit => 4},
254 :requirements => {:limit => /^\d+$/},