From 97b8fcc02c8d5a8639394d199c63c297d4a66e30 Mon Sep 17 00:00:00 2001 From: Giuseppe Bilotta Date: Tue, 15 Jul 2008 13:25:52 +0200 Subject: [PATCH] Use SQLite to store data --- mark2.rb | 252 ++++++++++++++++++++++++++++++++----------------------- test.rb | 5 +- 2 files changed, 148 insertions(+), 109 deletions(-) diff --git a/mark2.rb b/mark2.rb index 62563fa..c793ac3 100644 --- a/mark2.rb +++ b/mark2.rb @@ -2,7 +2,14 @@ # Author: Giuseppe Bilotta # New markov chain plugin +$swig_runtime_data_type_pointer2 = nil +require 'sqlite3' + class Array + def sql_group + "("+self.join(',')+")" + end + def butlast first(self.size-1) end @@ -25,72 +32,27 @@ class Array end self[i, count] end -end - -class ChanceHash - - def initialize - @hash = Hash.new(0) - @picker = [] - @total = 0 - @valid_pick = false - end - - def size - @hash.size - end - - def [](key) - @hash[key] - end - - def keys - @hash.keys - end - - def increase(el) - if @hash.key?(el) - @hash[el] += 1 - else - @hash[el] = 1 - end - @valid_pick = false - return @hash[el] - end - def decrease(el) - if @hash.key?(el) - @hash[el] -= 1 - @hash.delete(el) if @hash[el] == 0 - end - @valid_pick = false - return @hash[el] + def pick_with_chance(chance) + pick = rand(chance) + self.each { |el, cch| + ch = cch.to_i + return el if pick < ch + pick -= ch + } + nil end +end - def make_picker - @picker.clear - total = 0 - @hash.each { |el, ch| - total += ch - @picker << [total, el] - } - @total = total - @valid_pick = true +class String + def quoted + "'" + SQLite3::Database.quote(self)+"'" end +end - def random - case @hash.size - when 0 - return nil - when 1 - return @hash.keys.first - else - make_picker unless @valid_pick - pick = rand(@total) - @picker.each { |ch, el| - return el if pick < ch - } - end +class NilClass + def quoted + 'NULL' end end @@ -101,49 +63,99 @@ class MarkovChainer WNW = /\w+|\W/u attr_reader :max_order - def initialize(ord=5) + + def order(i=0) + "order#{i}" + end + + def word(i) + "word#{i}" + end + + def initialize(db, ord=5) + @db = SQLite3::Database.new(db) + @db.synchronous=0 @max_order = ord - @mkv = Hash.new { |hash, key| - hash[key] = {:prev => ChanceHash.new, :next => ChanceHash.new} - } - @mkv[nil] = ChanceHash.new + + @db.execute("create table if not exists #{order(0).quoted} (#{word(0).quoted} text unique not null primary key, 'chance' integer not null default 0)") + 1.upto(@max_order+1) do |i| + cols = (0..i).map { |j| word(j).quoted} + + cmd = "create table if not exists " + cmd << order(i).quoted + " (" + cmd << cols.map { |c| c+' text'}.join(',') + cmd << ", 'chance' integer not null default 0" + cmd << ", unique#{cols.sql_group})" + @db.execute(cmd) + end end def words - @mkv[nil].keys + @db.execute("select word0 from order0") end - def add_one(sym) - # Don't add nil to order 0 - return unless sym - @mkv[nil].increase(sym.to_sym) + def num_words + @db.get_first_value("select count(*) from order0").to_i end - def add_before(array, prev) - raise "Not enough words in new data" if array.empty? - raise "Too many words in new data" if array.size > @max_order - # Don't add prev to chains whose first element is nil - return unless array.first - h = @mkv[array.dup] - h[:prev].increase(prev) + def where_selector(words, o={}) + offset = o[:offset].to_i + ar = [] + words.length.times do |i| + if words[i] + ar << word(i+offset) + "=" + words[i].quoted + else + ar << word(i+offset) + " ISNULL" + end + end + "where #{ar.join(' and ')}" end - def add_after(array, nxt) - raise "Not enough words in new data" if array.empty? - raise "Too many words in new data" if array.size > @max_order - # Don't add next to chains whose last element is nil - return unless array.last - h = @mkv[array.dup] - h[:next].increase(nxt) + def grouped_selector(words, o={}) + offset= o[:offset].to_i + wds = [] + cols = [] + words.length.times do |i| + cols << word(i) + wds << words[i].quoted + end + if o.key?(:chance) + cols << "chance" + wds << o[:chance].to_i + end + return [cols.sql_group, wds.sql_group] + end + + def add_one(sym) + # Don't add nil to order 0 + return unless sym + @db.transaction do |db| + if db.get_first_value("select chance from order0 where word0=?1", sym) + db.execute("update order0 set chance=chance+1 where word0=?1", sym) + else + db.execute("insert into order0 (word0, chance) values (?1, 1)", sym) + end + end + # puts @db.execute("select * from order0 where word0=?1", sym).inspect end def add_multi(array) raise "Too many words in new data" if array.size > @max_order + 1 - add_before(array.butfirst, array.first) - add_after(array.butlast, array.last) + table = order(array.length-1).quoted + cols, wds = grouped_selector(array, :chance => 1) + where = where_selector(array) + @db.transaction do |db| + if db.get_first_value("select chance from #{table} " + where) + db.execute("update #{table} set chance=chance+1 " + where) + else + db.execute("insert into #{table} #{cols} values #{wds}") + end + end + # puts @db.execute("select * from #{table} " + where).inspect end def add(*data) + # puts "adding #{data.inspect}" if data.size == 1 add_one(data.first) else @@ -152,18 +164,20 @@ class MarkovChainer end def simple_learn(text) - syms = text.scan(WNW).map { |w| w.intern } + return if text.empty? + syms = text.scan(WNW) syms.unshift(nil) syms.push(nil) - syms.size.times { |i| - ([@max_order, syms.size-i].min+1).times { |ord| - v = syms[i, ord+1] + syms.each_index do |i| + max_len = [@max_order+1, syms.size - i].min + 1.upto max_len do |len| + v = syms[i, len] # puts "Learning #{v.inspect}" add(*v) # pp @mkv - } - } + end + end end def learn(text, o={}) @@ -175,42 +189,62 @@ class MarkovChainer if lc simple_learn(text.downcase) end - - pp @mkv if defined? pp end def raw_next(syms, o={}) max_order = o.fetch(:max_order, @max_order) - ar = syms.last([max_order, syms.size].min) - if @mkv.key?(ar) - @mkv[ar][:next].random + if max_order > syms.length + max_order = syms.length + end + ar = syms.last(max_order) + # puts "raw_next #{max_order} #{ar.inspect}" + + table = order(max_order) + sel = word(max_order) + where = where_selector(ar) + + choices = @db.execute("select #{sel},chance from #{table} #{where}") + unless choices.empty? + sum = @db.get_first_value("select sum(chance) from #{table} #{where}").to_i + return choices.pick_with_chance(sum) else raw_next(ar.butfirst, o) end end def next(text, o={}) - syms = text.scan(WNW).map { |w| w.intern } + syms = text.scan(WNW) raw_next(syms, o) end def raw_prev(syms, o={}) max_order = o.fetch(:max_order, @max_order) - ar = syms.first([max_order, syms.size].min) - if @mkv.key?(ar) - @mkv[ar][:prev].random + if max_order > syms.length + max_order = syms.length + end + ar = syms.first(max_order) + # puts "raw_prev #{max_order} #{ar.inspect}" + + table = order(max_order) + sel = word(0) + where = where_selector(ar,:offset => 1) + + choices = @db.execute("select #{sel}, chance from #{table} #{where}") + unless choices.empty? + sum = @db.get_first_value("select sum(chance) from #{table} #{where}").to_i + return choices.pick_with_chance(sum) else raw_prev(ar.butlast, o) end end def prev(text, o={}) - syms = text.scan(WNW).map { |w| w.intern } + syms = text.scan(WNW) raw_prev(syms, o) end def complete_prev(text, o={}) - syms = text.scan(WNW).map { |w| w.intern } + syms = text.scan(WNW) prev = raw_prev(syms, o) while prev do syms.unshift(prev) @@ -220,7 +254,7 @@ class MarkovChainer end def complete_next(text, o={}) - syms = text.scan(WNW).map { |w| w.intern } + syms = text.scan(WNW) nxt = raw_next(syms, o) while nxt do syms.push(nxt) @@ -231,13 +265,17 @@ class MarkovChainer def complete(text, o={}) txt = text + choices = @db.execute("select word0,chance from order0") + return String.new if choices.empty? + sum = @db.get_first_value("select sum(chance) from order0").to_i while txt.empty? do - txt = @mkv[nil].random.to_s + txt = choices.pick_with_chance(sum) end - syms = txt.scan(WNW).map { |w| w.intern } + syms = [txt] prev = raw_prev(syms, o) nxt = raw_next(syms, o) while nxt or prev do + # puts syms.inspect, nxt.inspect, prev.inspect # Keep adding only on the side where we # didn't come across a nil already if prev diff --git a/test.rb b/test.rb index 37f9808..5e72461 100755 --- a/test.rb +++ b/test.rb @@ -11,7 +11,7 @@ max_ord = ARGV.fetch(1, 5).to_i min_ord = ARGV.fetch(2, 5).to_i -mkv = MarkovChainer.new(max_ord) +mkv = MarkovChainer.new('provola.db', max_ord) size = File.size?(fname) @@ -20,12 +20,13 @@ return unless size old_ratio = 0 File.open(fname) { |file| file.each { |line| + time = Time.now mkv.learn(line.chomp) new_ratio = file.pos*100/size if new_ratio > old_ratio old_ratio = new_ratio puts "\n\n\nLearned #{new_ratio}%" - puts "%u words known" % mkv.words.length + puts "%u words known" % mkv.num_words min_ord.upto(max_ord) { |ord| puts "\nOrder #{ord}::" puts mkv.complete("", :max_order=>ord) -- 2.32.0.93.g670b81a890