4 # :title: rbot utilities provider
6 # Author:: Tom Gilbert <tom@linuxbrit.co.uk>
7 # Author:: Giuseppe "Oblomov" Bilotta <giuseppe.bilotta@gmail.com>
9 # Copyright:: (C) 2002-2006 Tom Gilbert
10 # Copyright:: (C) 2007 Giuseppe Bilotta
12 # TODO some of these Utils should be rewritten as extensions to the appropriate
13 # standard Ruby classes and accordingly be moved to extends.rb
20 require 'htmlentities'
21 $we_have_html_entities_decoder = true
23 gems = require 'rubygems' rescue false
27 $we_have_html_entities_decoder = false
50 # extra codes, for future use...
64 'otimes' => '⊗',
73 'Epsilon' => 'Ε',
77 'Upsilon' => 'Υ',
79 'there4' => '∴',
84 'rsaquo' => '›',
106 'lfloor' => '⌊',
113 'clubs' => '♣',
114 'diams' => '♦',
121 'Scaron' => 'Š',
127 'sbquo' => '‚',
140 'infin' => '∞',
145 'thinsp' => ' ',
147 'bdquo' => '„',
154 'mdash' => '—',
156 'permil' => '‰',
161 'forall' => '∀',
163 'rceil' => '⌉',
166 'lambda' => 'λ',
170 'dagger' => '†',
173 'image' => 'ℑ',
174 'alefsym' => 'ℵ',
180 'frasl' => '⁄',
182 'lowast' => '∗',
193 'oline' => '‾',
200 'empty' => '∅',
207 'weierp' => '℘',
212 'omicron' => 'ο',
213 'upsilon' => 'υ',
215 'Lambda' => 'Λ',
222 'scaron' => 'š',
223 'lsquo' => '‘',
231 'hellip' => '…',
235 'rfloor' => '⌋',
237 'crarr' => '↵',
239 'notin' => '∉',
240 'exist' => '∃',
243 'Dagger' => '‡',
244 'oplus' => '⊕',
250 'lsaquo' => '‹',
252 'Omicron' => 'Ο',
267 'sigmaf' => 'ς',
269 'minus' => '−',
272 'epsilon' => 'ε',
283 'spades' => '♠',
284 'rsquo' => '’',
288 'thetasym' => 'ϑ',
292 'ldquo' => '“',
293 'hearts' => '♥',
306 # miscellaneous useful functions
# Durations in seconds, each derived from the next smaller unit.
# SEC_PER_MIN is defined on a line not visible in this excerpt.
309 SEC_PER_HR = SEC_PER_MIN * 60
310 SEC_PER_DAY = SEC_PER_HR * 24
# A month is approximated as a fixed 30 days ...
311 SEC_PER_MNTH = SEC_PER_DAY * 30
# ... and a year as 12 such months, i.e. 360 days (not 365).
312 SEC_PER_YR = SEC_PER_MNTH * 12
# Helper for Utils.secs_to_string: appends one "<count> <unit>" entry to
# _array_, using _string_ as the singular unit name and _plural_ as the
# plural one.
# NOTE(review): the condition selecting between the two appends below is not
# visible in this excerpt -- presumably var == 1 picks the singular form.
314 def Utils.secs_to_string_case(array, var, string, plural)
# singular form, with the count hard-coded to 1
317 array << "1 #{string}"
# plural form, with the actual count
319 array << "#{var} #{plural}"
323 # turn a number of seconds into a human-readable string, e.g.
324 # 2 days, 3 hours, 18 minutes, 10 seconds
#
# The value is split from the largest unit down to the smallest with divmod;
# each step keeps the quotient as the unit count and carries the remainder
# forward in _secs_. Units with a zero count are omitted.
# Months and years use the fixed-length SEC_PER_MNTH / SEC_PER_YR constants.
# NOTE(review): _ret_ is initialised on a line not visible in this excerpt.
325 def Utils.secs_to_string(secs)
327 years, secs = secs.divmod SEC_PER_YR
328 secs_to_string_case(ret, years, "year", "years") if years > 0
329 months, secs = secs.divmod SEC_PER_MNTH
330 secs_to_string_case(ret, months, "month", "months") if months > 0
331 days, secs = secs.divmod SEC_PER_DAY
332 secs_to_string_case(ret, days, "day", "days") if days > 0
333 hours, secs = secs.divmod SEC_PER_HR
334 secs_to_string_case(ret, hours, "hour", "hours") if hours > 0
335 mins, secs = secs.divmod SEC_PER_MIN
336 secs_to_string_case(ret, mins, "minute", "minutes") if mins > 0
# Seconds are also emitted when nothing else was, so the result is never
# empty even for an input of 0.
338 secs_to_string_case(ret, secs, "second", "seconds") if secs > 0 or ret.empty?
# Sanity check; the guarding condition is not visible in this excerpt.
341 raise "Empty ret array!"
# All parts but the last are joined with ", " and the last one is attached
# with " and ".
345 return [ret[0, ret.length-1].join(", ") , ret[-1]].join(" and ")
# Runs _command_ with _args_ and returns what it printed as a single String.
# NOTE(review): how the command is spawned and what _p_ is bound to are not
# visible in this excerpt -- presumably _p_ is an IO on the child process.
350 def Utils.safe_exec(command, *args)
# NOTE(review): if _p_ is an IO, readlines keeps each line terminator, so
# joining with "\n" would double the newlines -- confirm this is intended.
353 return p.readlines.join("\n")
# NOTE(review): rescuing Exception also traps SystemExit and Interrupt;
# StandardError is usually the safer choice -- confirm before changing.
358 rescue Exception => e
359 puts "exec of #{command} led to exception: #{e.inspect}"
# Failure message; the branch it belongs to is not visible in this excerpt.
362 puts "exec of #{command} failed"
# Directory in which Utils.safe_save creates its temporary files.
# It is set to nil only when not already defined, so an existing value is
# left untouched.
369 @@safe_save_dir = nil unless defined?(@@safe_save_dir)
# Sets the safe save directory to a copy of _str_.
370 def Utils.set_safe_save_dir(str)
# dup, so the stored value is a separate String object from the caller's
371 @@safe_save_dir = str.dup
# Saves _file_ by way of a temporary file: a Tempfile is created in the safe
# save directory, yielded to the caller's block for writing, and then renamed
# over _file_.
# Raises a RuntimeError if no safe save directory has been set.
# NOTE(review): some lines between the statements below are not visible in
# this excerpt (e.g. whether the Tempfile is closed before the rename).
374 def Utils.safe_save(file)
375 raise 'No safe save directory defined!' if @@safe_save_dir.nil?
# The Tempfile name is derived from the target's base name.
376 basename = File.basename(file)
377 temp = Tempfile.new(basename,@@safe_save_dir)
# The caller writes the new content into the Tempfile.
379 yield temp if block_given?
# The target is replaced only after the block has returned.
381 File.rename(temp.path, file)
385 # returns a string containing the result of an HTTP GET on the uri
#
# _uristr_ is parsed with URI.parse; _readtimeout_ and _opentimeout_ are
# assigned to the connection's read_timeout and open_timeout.
# NOTE(review): the return statements and the rescue clause are not visible
# in this excerpt; only the logging of a rescued exception is.
386 def Utils.http_get(uristr, readtimeout=8, opentimeout=4)
388 # ruby 1.7 or better needed for this (or 1.6 and debian unstable)
389 Net::HTTP.version_1_2
390 # (so we support the 1_1 api anyway, avoids problems)
392 uri = URI.parse uristr
# The query string, when there is one, is appended to the request path.
395 query += "?#{uri.query}"
# Use the proxy named by the http_proxy environment variable, if set.
# The assignment inside the condition is intentional.
400 if(ENV['http_proxy'] && proxy_uri = URI.parse(ENV['http_proxy']))
401 proxy_host = proxy_uri.host
402 proxy_port = proxy_uri.port
406 http = Net::HTTP.new(uri.host, uri.port, proxy_host, proxy_port)
407 http.open_timeout = opentimeout
408 http.read_timeout = readtimeout
411 resp = http.get(query)
# The status code is compared as a String.
412 if resp.code == "200"
# _error_ is a logging helper defined outside this excerpt.
418 error "Utils.http_get exception: #{e.inspect}, while trying to get #{uristr}"
# Replaces HTML entities in _str_ with the characters they stand for.
# When the htmlentities library was loaded (global flag set at require time)
# the work is delegated to it; otherwise a local fallback based on
# UNESCAPE_TABLE is used.
423 def Utils.decode_html_entities(str)
424 if $we_have_html_entities_decoder
425 return HTMLEntities.decode_entities(str)
# Fallback: handle every "&...;" sequence; the lazy match stops at the
# first ';'.
427 str.gsub(/(&(.+?);)/) {
429 # remove the 0-padding from unicode integers
431 symbol = "##{$1.to_i.to_s}"
434 # output the symbol's irc-translated character, or a * if it's unknown
# Table lookup first; otherwise the first run of digits in the symbol is
# packed as a UTF-8 character. The rescue modifier yields '*' if that raises.
# NOTE(review): for a named entity missing from the table there are no
# digits, so this appears to pack code point 0 (a NUL character) instead of
# returning '*' -- confirm against the lines not visible in this excerpt.
435 UNESCAPE_TABLE[symbol] || [symbol[/\d+/].to_i].pack("U") rescue '*'
# All patterns below are case-insensitive (/i) and let '.' match newlines
# (/m), so a match may span several lines.
#
# A heading element <hN ...>...</hN>: group 1 is the level digit, group 2
# the lazily matched content; \1 requires the closing tag to have the same
# level.
440 HX_REGEX = /<h(\d)(?:\s+[^>]*)?>(.*?)<\/h\1>/im
# From an opening <p> tag up to the next opening or closing
# p/div/html/body/table/td/tr tag.
441 PAR_REGEX = /<p(?:\s+[^>]*)?>.*?<\/?(?:p|div|html|body|table|td|tr)(?:\s+[^>]*)?>/im
443 # Some blogging and forum platforms use spans or divs with a 'body' or 'message' or 'text' in their class
444 # to mark actual text
# Note that the keyword may appear anywhere inside the opening tag, not only
# in a class attribute.
445 AFTER_PAR1_REGEX = /<\w+\s+[^>]*(?:body|message|text)[^>]*>.*?<\/?(?:p|div|html|body|table|td|tr)(?:\s+[^>]*)?>/im
447 # At worst, we can try whatever is enclosed between two <br> tags
# (or between a <br> and the next block-level tag listed in the pattern)
448 AFTER_PAR2_REGEX = /<br(?:\s+[^>]*)?\/?>.*?<\/?(?:br|p|div|html|body|table|td|tr)(?:\s+[^>]*)?\/?>/im
450 # Try to grab and IRCify the first HTML par (<p> tag) in the given string.
451 # If possible, grab the one after the first heading
453 # It is possible to pass some options to determine how the stripping
454 # occurs. Currently supported options are
455 # * :strip => Regex or String to strip at the beginning of the obtained
457 # * :min_spaces => Minimum number of spaces a paragraph should have
#
# Candidates are tried in four passes, in this order: PAR_REGEX after the
# first heading, PAR_REGEX again, AFTER_PAR1_REGEX, AFTER_PAR2_REGEX. A
# candidate is accepted once it is non-empty and has at least :min_spaces
# spaces.
# NOTE(review): several lines are not visible in this excerpt, including the
# initialisation of _strip_ and _txt_ and the reassignments of
# _header_found_ between passes.
459 def Utils.ircify_first_html_par(xml_org, opts={})
# Drop HTML comments before searching.
# NOTE(review): this pattern has no /m flag, so comments spanning several
# lines are not removed -- confirm whether that is intended.
460 xml = xml_org.gsub(/<!--.*?-->/, '')
# A String :strip is turned into a Regexp matching it literally at the
# start of a line.
463 strip = Regexp.new(/^#{Regexp.escape(strip)}/) if strip.kind_of?(String)
# Default to 8 spaces; negative values are treated as 0.
465 min_spaces = opts[:min_spaces] || 8
466 min_spaces = 0 if min_spaces < 0
471 debug "Minimum number of spaces: #{min_spaces}"
# Pass 1: paragraphs after the first heading.
472 header_found = xml.match(HX_REGEX)
475 while txt.empty? or txt.count(" ") < min_spaces
476 candidate = header_found[PAR_REGEX]
477 break unless candidate
478 txt = candidate.ircify_html
480 txt.sub!(strip, '') if strip
481 debug "(Hx attempt) #{txt.inspect} has #{txt.count(" ")} spaces"
485 return txt unless txt.empty? or txt.count(" ") < min_spaces
487 # If we haven't found a first par yet, try to get it from the whole
# Pass 2: same pattern; what _header_found_ holds here is set on a line not
# visible in this excerpt.
490 while txt.empty? or txt.count(" ") < min_spaces
491 candidate = header_found[PAR_REGEX]
492 break unless candidate
493 txt = candidate.ircify_html
495 txt.sub!(strip, '') if strip
496 debug "(par attempt) #{txt.inspect} has #{txt.count(" ")} spaces"
499 return txt unless txt.empty? or txt.count(" ") < min_spaces
501 # Nothing yet ... let's get drastic: we look for non-par elements too,
502 # but only for those that match something that we know is likely to
# Pass 3: elements whose opening tag mentions body/message/text.
507 while txt.empty? or txt.count(" ") < min_spaces
508 candidate = header_found[AFTER_PAR1_REGEX]
509 break unless candidate
510 txt = candidate.ircify_html
512 txt.sub!(strip, '') if strip
513 debug "(other attempt \#1) #{txt.inspect} has #{txt.count(" ")} spaces"
516 return txt unless txt.empty? or txt.count(" ") < min_spaces
# Pass 4: text delimited by <br> tags.
520 while txt.empty? or txt.count(" ") < min_spaces
521 candidate = header_found[AFTER_PAR2_REGEX]
522 break unless candidate
523 txt = candidate.ircify_html
525 txt.sub!(strip, '') if strip
526 debug "(other attempt \#2) #{txt.inspect} has #{txt.count(" ")} spaces"
529 debug "Last candidate #{txt.inspect} has #{txt.count(" ")} spaces"
# The last candidate is returned only if it has enough spaces; what happens
# otherwise is not visible in this excerpt.
530 return txt unless txt.count(" ") < min_spaces
535 # Get the first pars of the first _count_ _urls_.
536 # The pages are downloaded using an HttpUtil service passed as _opts_ :http_util,
537 # and echoed as replies to the IRC message passed as _opts_ :message.
#
# The same _opts_ hash is passed on to Utils.ircify_first_html_par, so its
# options can be supplied here as well.
# NOTE(review): the assignments of _url_, _idx_ and _msg_ and the updates to
# _count_ are on lines not visible in this excerpt.
539 def Utils.get_first_pars(urls, count, opts={})
# Stop when enough pars have been produced or the urls run out.
542 while count > 0 and urls.length > 0
546 # FIXME what happens if some big file is returned? We should share
547 # code with the url plugin to only retrieve partial file content!
548 xml = opts[:http_util].get_cached(url)
550 debug "Unable to retrieve #{url}"
553 par = Utils.ircify_first_html_par(xml, opts)
555 debug "No first par found\n#{xml}"
556 # FIXME only do this if the 'url' plugin is loaded
557 # TODO even better, put the code here
558 # par = @bot.plugins['url'].get_title_from_html(xml)
# A reply is sent only when a message object is available; replies that are
# too long are truncated.
561 msg.reply "[#{idx}] #{par}", :overlong => :truncate if msg