From 86f7c7f6b63b24210ec3ed3e88d9d0f53d33f805 Mon Sep 17 00:00:00 2001 From: dakkar Date: Sat, 28 Dec 2019 15:27:03 +0000 Subject: naive-bayes classifying index half-tested by hand --- bayes | 6 ++- lib/MaildirIndexer/Index/ByAddresses.pm6 | 89 ++++++++++++++++++++++++++++++++ t/parser.t | 4 +- 3 files changed, 96 insertions(+), 3 deletions(-) create mode 100644 lib/MaildirIndexer/Index/ByAddresses.pm6 diff --git a/bayes b/bayes index a99a3e9..2601874 100644 --- a/bayes +++ b/bayes @@ -5,12 +5,16 @@ use MaildirIndexer::ScanDir; use MaildirIndexer::Store; use MaildirIndexer::Server; use MaildirIndexer::Index::ByRef; +use MaildirIndexer::Index::ByAddresses; sub MAIN($maildir) { my $file-channel = scan-dir($maildir).Channel; my $store = MaildirIndexer::Store.new( :$file-channel, - indices => MaildirIndexer::Index::ByRef.new, + indices => ( + MaildirIndexer::Index::ByRef.new, + MaildirIndexer::Index::ByAddresses.new, + ), ); my $server = MaildirIndexer::Server.new(:$store); diff --git a/lib/MaildirIndexer/Index/ByAddresses.pm6 b/lib/MaildirIndexer/Index/ByAddresses.pm6 new file mode 100644 index 0000000..5636f1f --- /dev/null +++ b/lib/MaildirIndexer/Index/ByAddresses.pm6 @@ -0,0 +1,89 @@ +use v6.d; +use MaildirIndexer::Index; +unit class MaildirIndexer::Index::ByAddresses does MaildirIndexer::Index; +use MaildirIndexer::LogTimelineSchema; +use MaildirIndexer::Email; + +# most of this is copied from +# p6-Algorithm-NaiveBayes:auth:ver<0.04>, in particular the +# Algorithm::NaiveBayes::Classifier::Bernoulli class + +has Array[Str] %!addresses-for-file; +has %!count-by-address-and-mailbox; +has %!known-addresses; +has %!count-by-mailbox; +has $!total-count; + +method dump() { +} + +submethod account-for(Str @addresses,$mailbox,Int $step) { + $!total-count += $step; + %!count-by-mailbox{$mailbox} += $step; + + for @addresses -> $addr { + %!known-addresses{$addr} += $step; + %!count-by-address-and-mailbox{$addr}{$mailbox} += $step; + } +} + +method add-mail(MaildirIndexer::Email:D $email, Str:D $mailbox --> Nil) { + MaildirIndexer::LogTimelineSchema::Index::Add.log: :class('ByAddresses'),:$mailbox, -> { + my Str @addresses = $email.addresses or return; + %!addresses-for-file{ $email.path } = @addresses; + + self.account-for(@addresses,$mailbox,1); + + return; + } +} + +method del-path(IO:D $file, Str:D $mailbox --> Nil) { + MaildirIndexer::LogTimelineSchema::Index::Rm.log: :class('ByAddresses'),:$mailbox, -> { + my Str @addresses = %!addresses-for-file{$file.path} or return; + + self.account-for(@addresses,$mailbox,-1); + + return; + } +} + +submethod p-address-given-mailbox($addr,$mailbox) { + my $a = 1 + (%!count-by-address-and-mailbox{$addr}{$mailbox} // 0); + my $b = 2 + (%!count-by-mailbox{$mailbox} // 0); + return $a / $b; +} + +submethod predict-mailbox-given-addresses(@addresses) { + my %prediction; + my Bag $addr-bag .= new(|@addresses); + + for %!count-by-mailbox.keys -> $mailbox { + my $p = 1; + + for %!known-addresses.keys -> $addr { + if ($addr-bag{$addr}) { + $p *= self.p-address-given-mailbox($addr,$mailbox); + } + else { + $p *= (1 - self.p-address-given-mailbox($addr,$mailbox)); + } + } + $p *= %!count-by-mailbox{$mailbox} / $!total-count; + %prediction{$mailbox} = $p; + } + + return %prediction; +} + +method mailbox-for-email(MaildirIndexer::Email:D $email --> Str) { + my Str $result; + MaildirIndexer::LogTimelineSchema::Index::Find.log: :class('ByAddresses'), -> { + my %prediction = self.predict-mailbox-given-addresses($email.addresses); + + my @most-probable-mailboxes = %prediction.pairs.sort(*.value).map(*.key); + + if @most-probable-mailboxes -> $_ { $result = .[*-1] } + } + return $result; +} diff --git a/t/parser.t b/t/parser.t index aafbb27..49be92e 100644 --- a/t/parser.t +++ b/t/parser.t @@ -5,12 +5,12 @@ use MaildirIndexer::Parser; use MaildirIndexer::Email; subtest 'from string' => { - is-deeply parse-email(''), Nil, "empty string won't parse"; + ok !parse-email(''), "empty string won't parse"; my $message = q:to; bad stuff EOM - is-deeply parse-email($message), Nil, "bad message won't parse"; + ok !parse-email($message), "bad message won't parse"; $message = q:to; Head: value -- cgit v1.2.3