use v6.d;
use MaildirIndexer::Index;
unit class MaildirIndexer::Index::ByAddresses does MaildirIndexer::Index;
use MaildirIndexer::LogTimelineSchema;
use MaildirIndexer::Email;

# Index that predicts the mailbox of an email from the addresses it
# carries, keeping per-mailbox and per-address counters up to date as
# mails are added and removed.
#
# most of this is copied from
# p6-Algorithm-NaiveBayes:auth:ver<0.04>, in particular the
# Algorithm::NaiveBayes::Classifier::Bernoulli class

# addresses recorded for each indexed file, keyed by the file's path
has Array[Str] %!addresses-for-file;
# occurrences of each `address => mailbox` pair
has BagHash $!count-by-address-and-mailbox .= new;
# occurrences of each address, across all mailboxes
has BagHash $!known-addresses .= new;
# number of indexed mails per mailbox
has BagHash $!count-by-mailbox .= new;
# estimate of P(address | mailbox), keyed by the `address => mailbox`
# pair (stringified, since this is a plain hash)
has Numeric %!p-address-given-mailbox;
# number of indexed mails over all mailboxes
has Int $!total-count;
# per-mailbox value precomputed by `do-update-cache`: the mailbox
# prior multiplied by (1 - P(address | mailbox)) for every known
# address
has Numeric %!cached-p-given-mailbox;
# every emit on this supplier requests a (debounced) cache rebuild
has Supplier $!update-cache .= new;

# stand-in for probabilities that would otherwise be zero / missing
constant $NOT-ZERO = 1e-15;

# Wires the cache rebuild to the update supplier. `stable(10)` only
# lets a request through once no newer request has arrived for 10
# seconds, so a burst of changes causes a single rebuild.
# NOTE(review): the tap presumably runs on a scheduler thread while
# callers may still be mutating the counters; nothing here
# synchronises the two -- confirm that is acceptable.
submethod TWEAK() {
    $!update-cache.Supply.stable(10).tap({ self.do-update-cache() });
}

# Intentionally empty.
method dump() { }

# Applies `$step` to every counter touched by one mail.
#
# @addresses - addresses of the mail
# $mailbox   - mailbox the mail belongs to
# $step      - amount to add to the counters; the callers in this
#              class pass 1 when adding and -1 when removing
#
# Only the P(address | mailbox) estimates of the addresses passed in
# are recomputed; estimates of other addresses in the same mailbox
# keep the value they had when they were last touched.
# NOTE(review): if @addresses can contain the same address twice, the
# pair count can exceed the mailbox count and the estimate can go
# above 1 -- confirm that Email.addresses never repeats an address.
submethod account-for(Str @addresses,Str $mailbox,Int $step) {
    $!total-count += $step;
    $!count-by-mailbox{$mailbox} += $step;

    for @addresses -> Str $addr {
        # decontainerise both sides so that equal address/mailbox
        # values give equal pairs when used as BagHash keys
        my Pair $pair = ( $addr<> => $mailbox<> );

        $!known-addresses{$addr} += $step;
        my Numeric $count = $!count-by-address-and-mailbox{$pair} += $step;

        if ($count) {
            my Numeric $a = $NOT-ZERO + $count;
            my Numeric $b = 1 + $!count-by-mailbox{$mailbox};
            %!p-address-given-mailbox{$pair} = $a / $b;
        }
        else {
            # the pair is no longer present anywhere: forget its
            # estimate, readers fall back to $NOT-ZERO
            %!p-address-given-mailbox{$pair} :delete;
        }
    }

    # ask for a cache rebuild (debounced in TWEAK)
    $!update-cache.emit(Any);
}

# Rebuilds %!cached-p-given-mailbox for every mailbox that currently
# has indexed mail, then drops the cached values of mailboxes that no
# longer have any, so the cache does not keep growing with mailboxes
# that have disappeared.
submethod do-update-cache() {
    MaildirIndexer::LogTimelineSchema::Index::Cache.log: {
        for $!count-by-mailbox.keys -> Str $mailbox {
            # prior of the mailbox ...
            my Numeric $p = $!count-by-mailbox{$mailbox} / $!total-count;

            # ... times the probability of *not* seeing each known
            # address in it
            for $!known-addresses.keys -> Str $addr {
                my $addr-p = %!p-address-given-mailbox{$addr<> => $mailbox<>} // $NOT-ZERO;
                $p *= 1 - $addr-p;
            }

            %!cached-p-given-mailbox{$mailbox} = $p;
        }

        # collect the stale keys first (array assignment is eager),
        # so the hash is not modified while its keys are iterated
        my @stale = %!cached-p-given-mailbox.keys.grep: {
            not $!count-by-mailbox{$_}:exists
        };
        %!cached-p-given-mailbox{@stale}:delete;
    }
}

# Indexes `$email` as belonging to `$mailbox`. Does nothing when the
# file is already indexed or when the email has no addresses.
method add-mail(MaildirIndexer::Email:D $email, Str:D $mailbox --> Nil) {
    MaildirIndexer::LogTimelineSchema::Index::Add.log: :class('ByAddresses'),:$mailbox, -> {
        # ignore adding the same file twice, files in maildirs are
        # immutable
        return if %!addresses-for-file{ $email.path }:exists;

        my Str @addresses = $email.addresses or return;
        %!addresses-for-file{ $email.path } := @addresses;
        self.account-for(@addresses,$mailbox,1);
        return;
    }
}

# Removes the file `$file` from the index, undoing the counts that
# were recorded for it under `$mailbox`. Does nothing when the file
# was never indexed.
method del-path(IO:D $file, Str:D $mailbox --> Nil) {
    MaildirIndexer::LogTimelineSchema::Index::Rm.log: :class('ByAddresses'),:$mailbox, -> {
        # using assignment would fail when the path isn't present in
        # the hash, because it tries to assign the (undefined)
        # Array[Str] as a single element, instead of splatting it;
        # also, binding is faster because it avoids a copy
        my Str @addresses := %!addresses-for-file{$file.path}:delete or return;
        self.account-for(@addresses,$mailbox,-1);
        return;
    }
}

# Scores every mailbox that currently has indexed mail against the
# given addresses; returns a hash of mailbox name => score, a larger
# score meaning a more likely mailbox.
#
# The cached value already contains a (1 - p) factor for each known
# address; multiplying by p / (1 - p) swaps that factor for p for the
# addresses that are actually present.
submethod predict-mailbox-given-addresses(@addresses) {
    my Numeric %prediction;

    for $!count-by-mailbox.keys -> Str $mailbox {
        # a mailbox the cache has not caught up with yet starts from
        # $NOT-ZERO
        my Numeric $p = %!cached-p-given-mailbox{$mailbox} // $NOT-ZERO;

        for @addresses -> Str $addr {
            my $addr-p = %!p-address-given-mailbox{$addr<> => $mailbox<>} // $NOT-ZERO;
            $p *= $addr-p / ( 1- $addr-p );
        }

        %prediction{$mailbox} = $p;
    }

    return %prediction;
}

# Returns the name of the mailbox with the highest score for
# `$email`, or the undefined `Str` type object when no mailbox has
# been indexed yet.
method mailbox-for-email(MaildirIndexer::Email:D $email --> Str) {
    my Str $result;

    MaildirIndexer::LogTimelineSchema::Index::Find.log: :class('ByAddresses'), -> {
        my %prediction = self.predict-mailbox-given-addresses($email.addresses);
        # ascending by score: the best candidate is the last element
        my @most-probable-mailboxes = %prediction.pairs.sort(*.value);
        if @most-probable-mailboxes -> $_ { $result = .[*-1].key }
    }

    return $result;
}