git.oblomov.eu Git - rbot/blob - data/rbot/plugins/url.rb

   1 define_structure :Url, :channel, :nick, :time, :url, :info
   2
   3 class ::UrlLinkError < RuntimeError
   4 end
   5
   6 class UrlPlugin < Plugin
   7   TITLE_RE = /<\s*?title\s*?>(.+?)<\s*?\/title\s*?>/im
   8   LINK_INFO = "[Link Info]"
   9   OUR_UNSAFE = Regexp.new("[^#{URI::PATTERN::UNRESERVED}#{URI::PATTERN::RESERVED}%# ]", false, 'N')
  10
  11   BotConfig.register BotConfigIntegerValue.new('url.max_urls',
  12     :default => 100, :validate => Proc.new{|v| v > 0},
  13     :desc => "Maximum number of urls to store. New urls replace oldest ones.")
  14   BotConfig.register BotConfigIntegerValue.new('url.display_link_info',
  15     :default => 0,
  16     :desc => "Get the title of links pasted to the channel and display it (also tells if the link is broken or the site is down). Do it for at most this many links per line (set to 0 to disable)")
  17   BotConfig.register BotConfigBooleanValue.new('url.titles_only',
  18     :default => false,
  19     :desc => "Only show info for links that have <title> tags (in other words, don't display info for jpegs, mpegs, etc.)")
  20   BotConfig.register BotConfigBooleanValue.new('url.first_par',
  21     :default => false,
  22     :desc => "Also try to get the first paragraph of a web page")
  23   BotConfig.register BotConfigBooleanValue.new('url.info_on_list',
  24     :default => false,
  25     :desc => "Show link info when listing/searching for urls")
  26
  27
  28   def initialize
  29     super
  30     @registry.set_default(Array.new)
  31     unless @bot.config['url.display_link_info'].kind_of?(Integer)
  32       @bot.config.items[:'url.display_link_info'].set_string(@bot.config['url.display_link_info'].to_s)
  33     end
  34   end
  35
  36   def help(plugin, topic="")
  37     "urls [<max>=4] => list <max> last urls mentioned in current channel, urls search [<max>=4] <regexp> => search for matching urls. In a private message, you must specify the channel to query, eg. urls <channel> [max], urls search <channel> [max] <regexp>"
  38   end
  39
  40   def get_title_from_html(pagedata)
  41     return unless TITLE_RE.match(pagedata)
  42     $1.ircify_html
  43   end
  44
  45   def get_title_for_url(uri_str, nick = nil, channel = nil, ircline = nil)
  46
  47     url = uri_str.kind_of?(URI) ? uri_str : URI.parse(uri_str)
  48     return if url.scheme !~ /https?/
  49
  50     logopts = Hash.new
  51     logopts[:nick] = nick if nick
  52     logopts[:channel] = channel if channel
  53     logopts[:ircline] = ircline if ircline
  54
  55     title = nil
  56     extra = String.new
  57
  58     begin
  59       debug "+ getting #{url.request_uri}"
  60       @bot.httputil.get_response(url) { |resp|
  61         case resp
  62         when Net::HTTPSuccess
  63
  64           debug resp.to_hash
  65
  66           if resp['content-type'] =~ /^text\/|(?:x|ht)ml/
  67             # The page is text or HTML, so we can try finding a title and, if
  68             # requested, the first par.
  69             #
  70             # We act differently depending on whether we want the first par or
  71             # not: in the first case we download the initial part and the parse
  72             # it; in the second case we only download as much as we need to find
  73             # the title
  74             #
  75             if @bot.config['url.first_par']
  76               partial = resp.partial_body(@bot.config['http.info_bytes'])
  77               logopts[:title] = title = get_title_from_html(partial)
  78               if url.fragment and not url.fragment.empty?
  79                 fragreg = /.*?<a\s+[^>]*name=["']?#{url.fragment}["']?.*?>/im
  80                 partial.sub!(fragreg,'')
  81               end
  82               first_par = Utils.ircify_first_html_par(partial, :strip => title)
  83               unless first_par.empty?
  84                 logopts[:extra] = first_par
  85                 extra << ", #{Bold}text#{Bold}: #{first_par}"
  86               end
  87               call_event(:url_added, url.to_s, logopts)
  88               return "#{Bold}title#{Bold}: #{title}#{extra}" if title
  89             else
  90               resp.partial_body(@bot.config['http.info_bytes']) { |part|
  91                 logopts[:title] = title = get_title_from_html(part)
  92                 call_event(:url_added, url.to_s, logopts)
  93                 return "#{Bold}title#{Bold}: #{title}" if title
  94               }
  95             end
  96           # if nothing was found, provide more basic info, as for non-html pages
  97           else
  98             resp.no_cache = true
  99           end
 100
 101           enc = resp['content-encoding']
 102           logopts[:extra] = String.new
 103           logopts[:extra] << "Content Type: #{resp['content-type']}"
 104           if enc
 105             logopts[:extra] << ", encoding: #{enc}"
 106             extra << ", #{Bold}encoding#{Bold}: #{enc}"
 107           end
 108
 109           unless @bot.config['url.titles_only']
 110             # content doesn't have title, just display info.
 111             size = resp['content-length'].gsub(/(\d)(?=\d{3}+(?:\.|$))(\d{3}\..*)?/,'\1,\2') rescue nil
 112             if size
 113               logopts[:extra] << ", size: #{size} bytes"
 114               size = ", #{Bold}size#{Bold}: #{size} bytes"
 115             end
 116             call_event(:url_added, url.to_s, logopts)
 117             return "#{Bold}type#{Bold}: #{resp['content-type']}#{size}#{extra}"
 118           end
 119           call_event(:url_added, url.to_s, logopts)
 120         else
 121           raise UrlLinkError, "getting link (#{resp.code} - #{resp.message})"
 122         end
 123       }
 124       return nil
 125     rescue Exception => e
 126       case e
 127       when UrlLinkError
 128         raise e
 129       else
 130         error e
 131         raise "connecting to site/processing information (#{e.message})"
 132       end
 133     end
 134   end
 135
 136   def listen(m)
 137     return unless m.kind_of?(PrivMessage)
 138     return if m.address?
 139
 140     escaped = URI.escape(m.message, OUR_UNSAFE)
 141     urls = URI.extract(escaped)
 142     return if urls.empty?
 143     debug "found urls #{urls.inspect}"
 144     list = @registry[m.target]
 145     urls_displayed = 0
 146     urls.each { |urlstr|
 147       debug "working on #{urlstr}"
 148       next unless urlstr =~ /^https?:/
 149       title = nil
 150       debug "display link info: #{@bot.config['url.display_link_info']}"
 151       if @bot.config['url.display_link_info'] > urls_displayed
 152         urls_displayed += 1
 153         Thread.start do
 154           debug "Getting title for #{urlstr}..."
 155           begin
 156             title = get_title_for_url urlstr, m.source.nick, m.channel, m.message
 157             if title
 158               m.reply "#{LINK_INFO} #{title}", :overlong => :truncate
 159               debug "Title found!"
 160             else
 161               debug "Title not found!"
 162             end
 163           rescue => e
 164             m.reply "Error #{e.message}"
 165           end
 166         end
 167       end
 168
 169       # check to see if this url is already listed
 170       next if list.find {|u| u.url == urlstr }
 171
 172       url = Url.new(m.target, m.sourcenick, Time.new, urlstr, title)
 173       debug "#{list.length} urls so far"
 174       if list.length > @bot.config['url.max_urls']
 175         list.pop
 176       end
 177       debug "storing url #{url.url}"
 178       list.unshift url
 179       debug "#{list.length} urls now"
 180     }
 181     @registry[m.target] = list
 182   end
 183
 184   def reply_urls(opts={})
 185     list = opts[:list]
 186     max = opts[:max]
 187     channel = opts[:channel]
 188     m = opts[:msg]
 189     return unless list and max and m
 190     list[0..(max-1)].each do |url|
 191       disp = "[#{url.time.strftime('%Y/%m/%d %H:%M:%S')}] <#{url.nick}> #{url.url}"
 192       if @bot.config['url.info_on_list']
 193         title = url.info || get_title_for_url(url.url, url.nick, channel) rescue nil
 194         # If the url info was missing and we now have some, try to upgrade it
 195         if channel and title and not url.info
 196           ll = @registry[channel]
 197           debug ll
 198           if el = ll.find { |u| u.url == url.url }
 199             el.info = title
 200             @registry[channel] = ll
 201           end
 202         end
 203         disp << " --> #{title}" if title
 204       end
 205       m.reply disp, :overlong => :truncate
 206     end
 207   end
 208
 209   def urls(m, params)
 210     channel = params[:channel] ? params[:channel] : m.target
 211     max = params[:limit].to_i
 212     max = 10 if max > 10
 213     max = 1 if max < 1
 214     list = @registry[channel]
 215     if list.empty?
 216       m.reply "no urls seen yet for channel #{channel}"
 217     else
 218       reply_urls :msg => m, :channel => channel, :list => list, :max => max
 219     end
 220   end
 221
 222   def search(m, params)
 223     channel = params[:channel] ? params[:channel] : m.target
 224     max = params[:limit].to_i
 225     string = params[:string]
 226     max = 10 if max > 10
 227     max = 1 if max < 1
 228     regex = Regexp.new(string, Regexp::IGNORECASE)
 229     list = @registry[channel].find_all {|url|
 230       regex.match(url.url) || regex.match(url.nick) ||
 231         (@bot.config['url.info_on_list'] && regex.match(url.info))
 232     }
 233     if list.empty?
 234       m.reply "no matches for channel #{channel}"
 235     else
 236       reply_urls :msg => m, :channel => channel, :list => list, :max => max
 237     end
 238   end
 239 end
 240
 241 plugin = UrlPlugin.new
 242 plugin.map 'urls search :channel :limit :string', :action => 'search',
 243                           :defaults => {:limit => 4},
 244                           :requirements => {:limit => /^\d+$/},
 245                           :public => false
 246 plugin.map 'urls search :limit :string', :action => 'search',
 247                           :defaults => {:limit => 4},
 248                           :requirements => {:limit => /^\d+$/},
 249                           :private => false
 250 plugin.map 'urls :channel :limit', :defaults => {:limit => 4},
 251                           :requirements => {:limit => /^\d+$/},
 252                           :public => false
 253 plugin.map 'urls :limit', :defaults => {:limit => 4},
 254                           :requirements => {:limit => /^\d+$/},
 255                           :private => false