git.oblomov.eu Git - rbot/blob - lib/rbot/core/utils/utils.rb

   1 #-- vim:sw=2:et
   2 #++
   3 #
   4 # :title: rbot utilities provider
   5 #
   6 # Author:: Tom Gilbert <tom@linuxbrit.co.uk>
   7 # Author:: Giuseppe "Oblomov" Bilotta <giuseppe.bilotta@gmail.com>
   8 #
# TODO some of these Utils should be rewritten as extensions to the appropriate
  10 # standard Ruby classes and accordingly be moved to extends.rb
  11
  12 require 'tempfile'
  13 require 'set'
  14
  15 # Try to load htmlentities, fall back to an HTML escape table.
  16 begin
  17   require 'htmlentities'
  18 rescue LoadError
  19   gems = nil
  20   begin
  21     gems = require 'rubygems'
  22   rescue LoadError
  23     gems = false
  24   end
  25   if gems
  26     retry
  27   else
  28     module ::Irc
  29       module Utils
  30         UNESCAPE_TABLE = {
  31     'laquo' => '«',
  32     'raquo' => '»',
  33     'quot' => '"',
  34     'apos' => '\'',
  35     'micro' => 'µ',
  36     'copy' => '©',
  37     'trade' => '™',
  38     'reg' => '®',
  39     'amp' => '&',
  40     'lt' => '<',
  41     'gt' => '>',
  42     'hellip' => '…',
  43     'nbsp' => ' ',
  44     'Agrave' => 'À',
  45     'Aacute' => 'Á',
  46     'Acirc' => 'Â',
  47     'Atilde' => 'Ã',
  48     'Auml' => 'Ä',
  49     'Aring' => 'Å',
  50     'AElig' => 'Æ',
  51     'OElig' => 'Œ',
  52     'Ccedil' => 'Ç',
  53     'Egrave' => 'È',
  54     'Eacute' => 'É',
  55     'Ecirc' => 'Ê',
  56     'Euml' => 'Ë',
  57     'Igrave' => 'Ì',
  58     'Iacute' => 'Í',
  59     'Icirc' => 'Î',
  60     'Iuml' => 'Ï',
  61     'ETH' => 'Ð',
  62     'Ntilde' => 'Ñ',
  63     'Ograve' => 'Ò',
  64     'Oacute' => 'Ó',
  65     'Ocirc' => 'Ô',
  66     'Otilde' => 'Õ',
  67     'Ouml' => 'Ö',
  68     'Oslash' => 'Ø',
  69     'Ugrave' => 'Ù',
  70     'Uacute' => 'Ú',
  71     'Ucirc' => 'Û',
  72     'Uuml' => 'Ü',
  73     'Yacute' => 'Ý',
  74     'THORN' => 'Þ',
  75     'szlig' => 'ß',
  76     'agrave' => 'à',
  77     'aacute' => 'á',
  78     'acirc' => 'â',
  79     'atilde' => 'ã',
  80     'auml' => 'ä',
  81     'aring' => 'å',
  82     'aelig' => 'æ',
  83     'oelig' => 'œ',
  84     'ccedil' => 'ç',
  85     'egrave' => 'è',
  86     'eacute' => 'é',
  87     'ecirc' => 'ê',
  88     'euml' => 'ë',
  89     'igrave' => 'ì',
  90     'iacute' => 'í',
  91     'icirc' => 'î',
  92     'iuml' => 'ï',
  93     'eth' => 'ð',
  94     'ntilde' => 'ñ',
  95     'ograve' => 'ò',
  96     'oacute' => 'ó',
  97     'ocirc' => 'ô',
  98     'otilde' => 'õ',
  99     'ouml' => 'ö',
 100     'oslash' => 'ø',
 101     'ugrave' => 'ù',
 102     'uacute' => 'ú',
 103     'ucirc' => 'û',
 104     'uuml' => 'ü',
 105     'yacute' => 'ý',
 106     'thorn' => 'þ',
 107     'yuml' => 'ÿ'
 108         }
 109       end
 110     end
 111   end
 112 end
 113
 114 begin
 115   require 'hpricot'
 116   module ::Irc
 117     module Utils
 118       AFTER_PAR_PATH = /^(?:div|span)$/
 119       AFTER_PAR_EX = /^(?:td|tr|tbody|table)$/
 120       AFTER_PAR_CLASS = /body|message|text/i
 121     end
 122   end
 123 rescue LoadError
 124   gems = nil
 125   begin
 126     gems = require 'rubygems'
 127   rescue LoadError
 128     gems = false
 129   end
 130   if gems
 131     retry
 132   else
 133     module ::Irc
 134       module Utils
 135         # Some regular expressions to manage HTML data
 136
 137         # Title
 138         TITLE_REGEX = /<\s*?title\s*?>(.+?)<\s*?\/title\s*?>/im
 139
 140         # H1, H2, etc
 141         HX_REGEX = /<h(\d)(?:\s+[^>]*)?>(.*?)<\/h\1>/im
 142         # A paragraph
 143         PAR_REGEX = /<p(?:\s+[^>]*)?>.*?<\/?(?:p|div|html|body|table|td|tr)(?:\s+[^>]*)?>/im
 144
 145         # Some blogging and forum platforms use spans or divs with a 'body' or 'message' or 'text' in their class
 146         # to mark actual text
 147         AFTER_PAR1_REGEX = /<\w+\s+[^>]*(?:body|message|text)[^>]*>.*?<\/?(?:p|div|html|body|table|td|tr)(?:\s+[^>]*)?>/im
 148
 149         # At worst, we can try stuff which is comprised between two <br>
 150         AFTER_PAR2_REGEX = /<br(?:\s+[^>]*)?\/?>.*?<\/?(?:br|p|div|html|body|table|td|tr)(?:\s+[^>]*)?\/?>/im
 151       end
 152     end
 153   end
 154 end
 155
 156 module ::Irc
 157
 158   # Miscellaneous useful functions
 159   module Utils
 160     @@bot = nil unless defined? @@bot
 161     @@safe_save_dir = nil unless defined?(@@safe_save_dir)
 162
 163     # The bot instance
 164     def Utils.bot
 165       @@bot
 166     end
 167
 168     # Set up some Utils routines which depend on the associated bot.
 169     def Utils.bot=(b)
 170       debug "initializing utils"
 171       @@bot = b
 172       @@safe_save_dir = "#{@@bot.botclass}/safe_save"
 173     end
 174
 175
 176     # Seconds per minute
 177     SEC_PER_MIN = 60
 178     # Seconds per hour
 179     SEC_PER_HR = SEC_PER_MIN * 60
 180     # Seconds per day
 181     SEC_PER_DAY = SEC_PER_HR * 24
 182     # Seconds per week
 183     SEC_PER_WK = SEC_PER_DAY * 7
 184     # Seconds per (30-day) month
 185     SEC_PER_MNTH = SEC_PER_DAY * 30
 186     # Second per (non-leap) year
 187     SEC_PER_YR = SEC_PER_DAY * 365
 188
 189     # Auxiliary method needed by Utils.secs_to_string
 190     def Utils.secs_to_string_case(array, var, string, plural)
 191       case var
 192       when 1
 193         array << "1 #{string}"
 194       else
 195         array << "#{var} #{plural}"
 196       end
 197     end
 198
 199     # Turn a number of seconds into a human readable string, e.g
 200     # 2 days, 3 hours, 18 minutes and 10 seconds
 201     def Utils.secs_to_string(secs)
 202       ret = []
 203       years, secs = secs.divmod SEC_PER_YR
 204       secs_to_string_case(ret, years, _("year"), _("years")) if years > 0
 205       months, secs = secs.divmod SEC_PER_MNTH
 206       secs_to_string_case(ret, months, _("month"), _("months")) if months > 0
 207       days, secs = secs.divmod SEC_PER_DAY
 208       secs_to_string_case(ret, days, _("day"), _("days")) if days > 0
 209       hours, secs = secs.divmod SEC_PER_HR
 210       secs_to_string_case(ret, hours, _("hour"), _("hours")) if hours > 0
 211       mins, secs = secs.divmod SEC_PER_MIN
 212       secs_to_string_case(ret, mins, _("minute"), _("minutes")) if mins > 0
 213       secs = secs.to_i
 214       secs_to_string_case(ret, secs, _("second"), _("seconds")) if secs > 0 or ret.empty?
 215       case ret.length
 216       when 0
 217         raise "Empty ret array!"
 218       when 1
 219         return ret.to_s
 220       else
 221         return [ret[0, ret.length-1].join(", ") , ret[-1]].join(_(" and "))
 222       end
 223     end
 224
 225     # Turn a number of seconds into a hours:minutes:seconds e.g.
 226     # 3:18:10 or 5'12" or 7s
 227     #
 228     def Utils.secs_to_short(seconds)
 229       secs = seconds.to_i # make sure it's an integer
 230       mins, secs = secs.divmod 60
 231       hours, mins = mins.divmod 60
 232       if hours > 0
 233         return ("%s:%s:%s" % [hours, mins, secs])
 234       elsif mins > 0
 235         return ("%s'%s\"" % [mins, secs])
 236       else
 237         return ("%ss" % [secs])
 238       end
 239     end
 240
 241     # Returns human readable time.
 242     # Like: 5 days ago
 243     #       about one hour ago
 244     # options
 245     # :start_date, sets the time to measure against, defaults to now
    # :date_format, accepted but currently ignored by the implementation
 247     def Utils.timeago(time, options = {})
 248       start_date = options.delete(:start_date) || Time.new
 249       date_format = options.delete(:date_format) || "%x"
 250       delta = (start_date - time).round
 251       if delta.abs < 2
 252         _("right now")
 253       else
 254         distance = Utils.age_string(delta)
 255         if delta < 0
 256           _("%{d} from now") % {:d => distance}
 257         else
 258           _("%{d} ago") % {:d => distance}
 259         end
 260       end
 261     end
 262
 263     # Converts age in seconds to "nn units". Inspired by previous attempts
 264     # but also gitweb's age_string() sub
 265     def Utils.age_string(secs)
 266       case
 267       when secs < 0
 268         Utils.age_string(-secs)
 269       when secs > 2*SEC_PER_YR
 270         _("%{m} years") % { :m => secs/SEC_PER_YR }
 271       when secs > 2*SEC_PER_MNTH
 272         _("%{m} months") % { :m => secs/SEC_PER_MNTH }
 273       when secs > 2*SEC_PER_WK
 274         _("%{m} weeks") % { :m => secs/SEC_PER_WK }
 275       when secs > 2*SEC_PER_DAY
 276         _("%{m} days") % { :m => secs/SEC_PER_DAY }
 277       when secs > 2*SEC_PER_HR
 278         _("%{m} hours") % { :m => secs/SEC_PER_HR }
 279       when (20*SEC_PER_MIN..40*SEC_PER_MIN).include?(secs)
 280         _("half an hour")
 281       when (50*SEC_PER_MIN..70*SEC_PER_MIN).include?(secs)
 282         # _("about one hour")
 283         _("an hour")
 284       when (80*SEC_PER_MIN..100*SEC_PER_MIN).include?(secs)
 285         _("an hour and a half")
 286       when secs > 2*SEC_PER_MIN
 287         _("%{m} minutes") % { :m => secs/SEC_PER_MIN }
 288       when secs > 1
 289         _("%{m} seconds") % { :m => secs }
 290       else
 291         _("one second")
 292       end
 293     end
 294
    # Execute an external program, returning a String obtained by redirecting
    # the program's standard error and standard output
 297     #
 298     def Utils.safe_exec(command, *args)
 299       IO.popen("-") { |p|
 300         if p
 301           return p.readlines.join("\n")
 302         else
 303           begin
 304             $stderr.reopen($stdout)
 305             exec(command, *args)
 306           rescue Exception => e
 307             puts "exec of #{command} led to exception: #{e.pretty_inspect}"
 308             Kernel::exit! 0
 309           end
 310           puts "exec of #{command} failed"
 311           Kernel::exit! 0
 312         end
 313       }
 314     end
 315
 316
 317     # Safely (atomically) save to _file_, by passing a tempfile to the block
 318     # and then moving the tempfile to its final location when done.
 319     #
 320     # call-seq: Utils.safe_save(file, &block)
 321     #
 322     def Utils.safe_save(file)
 323       raise 'No safe save directory defined!' if @@safe_save_dir.nil?
 324       basename = File.basename(file)
 325       temp = Tempfile.new(basename,@@safe_save_dir)
 326       temp.binmode
 327       yield temp if block_given?
 328       temp.close
 329       File.rename(temp.path, file)
 330     end
 331
 332
 333     # Decode HTML entities in the String _str_, using HTMLEntities if the
 334     # package was found, or UNESCAPE_TABLE otherwise.
 335     #
 336     def Utils.decode_html_entities(str)
 337       if defined? ::HTMLEntities
 338         return HTMLEntities.decode_entities(str)
 339       else
 340         str.gsub(/(&(.+?);)/) {
 341           symbol = $2
 342           # remove the 0-paddng from unicode integers
 343           if symbol =~ /^#(\d+)$/
 344             symbol = $1.to_i.to_s
 345           end
 346
 347           # output the symbol's irc-translated character, or a * if it's unknown
 348           UNESCAPE_TABLE[symbol] || (symbol.match(/^\d+$/) ? [symbol.to_i].pack("U") : '*')
 349         }
 350       end
 351     end
 352
 353     # Try to grab and IRCify the first HTML par (<p> tag) in the given string.
 354     # If possible, grab the one after the first heading
 355     #
 356     # It is possible to pass some options to determine how the stripping
 357     # occurs. Currently supported options are
 358     # strip:: Regex or String to strip at the beginning of the obtained
 359     #         text
 360     # min_spaces:: minimum number of spaces a paragraph should have
 361     #
 362     def Utils.ircify_first_html_par(xml_org, opts={})
 363       if defined? ::Hpricot
 364         Utils.ircify_first_html_par_wh(xml_org, opts)
 365       else
 366         Utils.ircify_first_html_par_woh(xml_org, opts)
 367       end
 368     end
 369
 370     # HTML first par grabber using hpricot
 371     def Utils.ircify_first_html_par_wh(xml_org, opts={})
 372       doc = Hpricot(xml_org)
 373
 374       # Strip styles and scripts
 375       (doc/"style|script").remove
 376
 377       debug doc
 378
 379       strip = opts[:strip]
 380       strip = Regexp.new(/^#{Regexp.escape(strip)}/) if strip.kind_of?(String)
 381
 382       min_spaces = opts[:min_spaces] || 8
 383       min_spaces = 0 if min_spaces < 0
 384
 385       txt = String.new
 386
 387       pre_h = pars = by_span = nil
 388
 389       while true
 390         debug "Minimum number of spaces: #{min_spaces}"
 391
 392         # Initial attempt: <p> that follows <h\d>
 393         if pre_h.nil?
 394           pre_h = Hpricot::Elements[]
 395           found_h = false
 396           doc.search("*") { |e|
 397             next if e.bogusetag?
 398             case e.pathname
 399             when /^h\d/
 400               found_h = true
 401             when 'p'
 402               pre_h << e if found_h
 403             end
 404           }
 405           debug "Hx: found: #{pre_h.pretty_inspect}"
 406         end
 407
 408         pre_h.each { |p|
 409           debug p
 410           txt = p.to_html.ircify_html
 411           txt.sub!(strip, '') if strip
 412           debug "(Hx attempt) #{txt.inspect} has #{txt.count(" ")} spaces"
 413           break unless txt.empty? or txt.count(" ") < min_spaces
 414         }
 415
 416         return txt unless txt.empty? or txt.count(" ") < min_spaces
 417
 418         # Second natural attempt: just get any <p>
 419         pars = doc/"p" if pars.nil?
 420         debug "par: found: #{pars.pretty_inspect}"
 421         pars.each { |p|
 422           debug p
 423           txt = p.to_html.ircify_html
 424           txt.sub!(strip, '') if strip
 425           debug "(par attempt) #{txt.inspect} has #{txt.count(" ")} spaces"
 426           break unless txt.empty? or txt.count(" ") < min_spaces
 427         }
 428
 429         return txt unless txt.empty? or txt.count(" ") < min_spaces
 430
 431         # Nothing yet ... let's get drastic: we look for non-par elements too,
 432         # but only for those that match something that we know is likely to
 433         # contain text
 434
 435         # Some blogging and forum platforms use spans or divs with a 'body' or
 436         # 'message' or 'text' in their class to mark actual text. Since we want
 437         # the class match to be partial and case insensitive, we collect
 438         # the common elements that may have this class and then filter out those
 439         # we don't need. If no divs or spans are found, we'll accept additional
 440         # elements too (td, tr, tbody, table).
 441         if by_span.nil?
 442           by_span = Hpricot::Elements[]
 443           extra = Hpricot::Elements[]
 444           doc.search("*") { |el|
 445             next if el.bogusetag?
 446             case el.pathname
 447             when AFTER_PAR_PATH
 448               by_span.push el if el[:class] =~ AFTER_PAR_CLASS or el[:id] =~ AFTER_PAR_CLASS
 449             when AFTER_PAR_EX
 450               extra.push el if el[:class] =~ AFTER_PAR_CLASS or el[:id] =~ AFTER_PAR_CLASS
 451             end
 452           }
 453           if by_span.empty? and not extra.empty?
 454             by_span.concat extra
 455           end
 456           debug "other \#1: found: #{by_span.pretty_inspect}"
 457         end
 458
 459         by_span.each { |p|
 460           debug p
 461           txt = p.to_html.ircify_html
 462           txt.sub!(strip, '') if strip
 463           debug "(other attempt \#1) #{txt.inspect} has #{txt.count(" ")} spaces"
 464           break unless txt.empty? or txt.count(" ") < min_spaces
 465         }
 466
 467         return txt unless txt.empty? or txt.count(" ") < min_spaces
 468
 469         # At worst, we can try stuff which is comprised between two <br>
 470         # TODO
 471
 472         debug "Last candidate #{txt.inspect} has #{txt.count(" ")} spaces"
 473         return txt unless txt.count(" ") < min_spaces
 474         break if min_spaces == 0
 475         min_spaces /= 2
 476       end
 477     end
 478
 479     # HTML first par grabber without hpricot
 480     def Utils.ircify_first_html_par_woh(xml_org, opts={})
 481       xml = xml_org.gsub(/<!--.*?-->/m, '').gsub(/<script(?:\s+[^>]*)?>.*?<\/script>/im, "").gsub(/<style(?:\s+[^>]*)?>.*?<\/style>/im, "")
 482
 483       strip = opts[:strip]
 484       strip = Regexp.new(/^#{Regexp.escape(strip)}/) if strip.kind_of?(String)
 485
 486       min_spaces = opts[:min_spaces] || 8
 487       min_spaces = 0 if min_spaces < 0
 488
 489       txt = String.new
 490
 491       while true
 492         debug "Minimum number of spaces: #{min_spaces}"
 493         header_found = xml.match(HX_REGEX)
 494         if header_found
 495           header_found = $'
 496           while txt.empty? or txt.count(" ") < min_spaces
 497             candidate = header_found[PAR_REGEX]
 498             break unless candidate
 499             txt = candidate.ircify_html
 500             header_found = $'
 501             txt.sub!(strip, '') if strip
 502             debug "(Hx attempt) #{txt.inspect} has #{txt.count(" ")} spaces"
 503           end
 504         end
 505
 506         return txt unless txt.empty? or txt.count(" ") < min_spaces
 507
 508         # If we haven't found a first par yet, try to get it from the whole
 509         # document
 510         header_found = xml
 511         while txt.empty? or txt.count(" ") < min_spaces
 512           candidate = header_found[PAR_REGEX]
 513           break unless candidate
 514           txt = candidate.ircify_html
 515           header_found = $'
 516           txt.sub!(strip, '') if strip
 517           debug "(par attempt) #{txt.inspect} has #{txt.count(" ")} spaces"
 518         end
 519
 520         return txt unless txt.empty? or txt.count(" ") < min_spaces
 521
 522         # Nothing yet ... let's get drastic: we look for non-par elements too,
 523         # but only for those that match something that we know is likely to
 524         # contain text
 525
 526         # Attempt #1
 527         header_found = xml
 528         while txt.empty? or txt.count(" ") < min_spaces
 529           candidate = header_found[AFTER_PAR1_REGEX]
 530           break unless candidate
 531           txt = candidate.ircify_html
 532           header_found = $'
 533           txt.sub!(strip, '') if strip
 534           debug "(other attempt \#1) #{txt.inspect} has #{txt.count(" ")} spaces"
 535         end
 536
 537         return txt unless txt.empty? or txt.count(" ") < min_spaces
 538
 539         # Attempt #2
 540         header_found = xml
 541         while txt.empty? or txt.count(" ") < min_spaces
 542           candidate = header_found[AFTER_PAR2_REGEX]
 543           break unless candidate
 544           txt = candidate.ircify_html
 545           header_found = $'
 546           txt.sub!(strip, '') if strip
 547           debug "(other attempt \#2) #{txt.inspect} has #{txt.count(" ")} spaces"
 548         end
 549
 550         debug "Last candidate #{txt.inspect} has #{txt.count(" ")} spaces"
 551         return txt unless txt.count(" ") < min_spaces
 552         break if min_spaces == 0
 553         min_spaces /= 2
 554       end
 555     end
 556
 557     # This method extracts title, content (first par) and extra
 558     # information from the given document _doc_.
 559     #
 560     # _doc_ can be an URI, a Net::HTTPResponse or a String.
 561     #
 562     # If _doc_ is a String, only title and content information
 563     # are retrieved (if possible), using standard methods.
 564     #
 565     # If _doc_ is an URI or a Net::HTTPResponse, additional
 566     # information is retrieved, and special title/summary
 567     # extraction routines are used if possible.
 568     #
 569     def Utils.get_html_info(doc, opts={})
 570       case doc
 571       when String
 572         Utils.get_string_html_info(doc, opts)
 573       when Net::HTTPResponse
 574         Utils.get_resp_html_info(doc, opts)
 575       when URI
 576         ret = DataStream.new
 577         @@bot.httputil.get_response(doc) { |resp|
 578           ret.replace Utils.get_resp_html_info(resp, opts)
 579         }
 580         return ret
 581       else
 582         raise
 583       end
 584     end
 585
 586     class ::UrlLinkError < RuntimeError
 587     end
 588
 589     # This method extracts title, content (first par) and extra
 590     # information from the given Net::HTTPResponse _resp_.
 591     #
 592     # Currently, the only accepted options (in _opts_) are
 593     # uri_fragment:: the URI fragment of the original request
 594     # full_body::    get the whole body instead of
 595     #                @@bot.config['http.info_bytes'] bytes only
 596     #
 597     # Returns a DataStream with the following keys:
 598     # text:: the (partial) body
 599     # title:: the title of the document (if any)
 600     # content:: the first paragraph of the document (if any)
 601     # headers::
 602     #   the headers of the Net::HTTPResponse. The value is
 603     #   a Hash whose keys are lowercase forms of the HTTP
 604     #   header fields, and whose values are Arrays.
 605     #
 606     def Utils.get_resp_html_info(resp, opts={})
 607       case resp
 608       when Net::HTTPSuccess
 609         loc = URI.parse(resp['x-rbot-location'] || resp['location']) rescue nil
 610         if loc and loc.fragment and not loc.fragment.empty?
 611           opts[:uri_fragment] ||= loc.fragment
 612         end
 613         ret = DataStream.new(opts.dup)
 614         ret[:headers] = resp.to_hash
 615         ret[:text] = partial = opts[:full_body] ? resp.body : resp.partial_body(@@bot.config['http.info_bytes'])
 616
 617         filtered = Utils.try_htmlinfo_filters(ret)
 618
 619         if filtered
 620           return filtered
 621         elsif resp['content-type'] =~ /^text\/|(?:x|ht)ml/
 622           ret.merge!(Utils.get_string_html_info(partial, opts))
 623         end
 624         return ret
 625       else
 626         raise UrlLinkError, "getting link (#{resp.code} - #{resp.message})"
 627       end
 628     end
 629
 630     # This method runs an appropriately-crafted DataStream _ds_ through the
 631     # filters in the :htmlinfo filter group, in order. If one of the filters
 632     # returns non-nil, its results are merged in _ds_ and returned. Otherwise
 633     # nil is returned.
 634     #
    # The input DataStream should have the downloaded HTML as primary key
    # (:text) and possibly a :headers key holding the response headers.
 637     #
 638     def Utils.try_htmlinfo_filters(ds)
 639       filters = @@bot.filter_names(:htmlinfo)
 640       return nil if filters.empty?
 641       cur = nil
 642       # TODO filter priority
 643       filters.each { |n|
 644         debug "testing filter #{n}"
 645         cur = @@bot.filter(@@bot.global_filter_name(n, :htmlinfo), ds)
 646         debug "returned #{cur.pretty_inspect}"
 647         break if cur
 648       }
 649       return ds.merge(cur) if cur
 650     end
 651
 652     # HTML info filters often need to check if the webpage location
 653     # of a passed DataStream _ds_ matches a given Regexp.
 654     def Utils.check_location(ds, rx)
 655       debug ds[:headers]
 656       if h = ds[:headers]
 657         loc = [h['x-rbot-location'],h['location']].flatten.grep(rx)
 658       end
 659       loc ||= []
 660       debug loc
 661       return loc.empty? ? nil : loc
 662     end
 663
 664     # This method extracts title and content (first par)
 665     # from the given HTML or XML document _text_, using
 666     # standard methods (String#ircify_html_title,
 667     # Utils.ircify_first_html_par)
 668     #
 669     # Currently, the only accepted option (in _opts_) is
 670     # uri_fragment:: the URI fragment of the original request
 671     #
 672     def Utils.get_string_html_info(text, opts={})
 673       debug "getting string html info"
 674       txt = text.dup
 675       title = txt.ircify_html_title
 676       debug opts
 677       if frag = opts[:uri_fragment] and not frag.empty?
 678         fragreg = /<a\s+(?:[^>]+\s+)?(?:name|id)=["']?#{frag}["']?[^>]*>/im
 679         debug fragreg
 680         debug txt
 681         if txt.match(fragreg)
 682           # grab the post-match
 683           txt = $'
 684         end
 685         debug txt
 686       end
 687       c_opts = opts.dup
 688       c_opts[:strip] ||= title
 689       content = Utils.ircify_first_html_par(txt, c_opts)
 690       content = nil if content.empty?
 691       return {:title => title, :content => content}
 692     end
 693
 694     # Get the first pars of the first _count_ _urls_.
 695     # The pages are downloaded using the bot httputil service.
 696     # Returns an array of the first paragraphs fetched.
 697     # If (optional) _opts_ :message is specified, those paragraphs are
 698     # echoed as replies to the IRC message passed as _opts_ :message
 699     #
 700     def Utils.get_first_pars(urls, count, opts={})
 701       idx = 0
 702       msg = opts[:message]
 703       retval = Array.new
 704       while count > 0 and urls.length > 0
 705         url = urls.shift
 706         idx += 1
 707
 708         begin
 709           info = Utils.get_html_info(URI.parse(url), opts)
 710
 711           par = info[:content]
 712           retval.push(par)
 713
 714           if par
 715             msg.reply "[#{idx}] #{par}", :overlong => :truncate if msg
 716             count -=1
 717           end
 718         rescue
 719           debug "Unable to retrieve #{url}: #{$!}"
 720           next
 721         end
 722       end
 723       return retval
 724     end
 725
 726   end
 727 end
 728
 729 Irc::Utils.bot = Irc::Bot::Plugins.manager.bot