# Reconstructed from commit 5b10920b6e38614ceea0cd97031ab48f4f1f9a39
# (dakkar, Sun, 29 Dec 2019 13:24:00 +0000, "new module extension"),
# which created lib/MaildirIndexer/Index/ByAddresses.rakumod.

use v6.d;
use MaildirIndexer::Index;
unit class MaildirIndexer::Index::ByAddresses does MaildirIndexer::Index;
use MaildirIndexer::LogTimelineSchema;
use MaildirIndexer::Email;

# Index that guesses the mailbox of an email from the addresses it
# carries, using a Bernoulli naive-Bayes model over (address, mailbox)
# co-occurrence counts.
#
# most of this is copied from
# p6-Algorithm-NaiveBayes:auth:ver<0.04>, in particular the
# Algorithm::NaiveBayes::Classifier::Bernoulli class

# addresses recorded for each indexed file, keyed by the file's path,
# so that `del-path` can undo exactly what `add-mail` counted
has Array[Str] %!addresses-for-file;
# I'd like to type-constrain these BagHash-es, but the compiler
# currently dies if I try
# number of indexed emails per `address => mailbox` pair
has BagHash $!count-by-address-and-mailbox .= new;
# number of indexed emails per address, across all mailboxes
has BagHash $!known-addresses .= new;
# number of indexed emails per mailbox
has BagHash $!count-by-mailbox .= new;
# number of indexed emails overall
has Int $!total-count = 0;

# Currently does nothing.
method dump() {
}

# Adds `$step` to every counter touched by one email: the grand total,
# the per-mailbox count, and, for each of `@addresses`, the per-address
# and per-(address, mailbox) counts. `add-mail` passes 1, `del-path`
# passes -1.
submethod account-for(Str @addresses,Str $mailbox,Int $step) {
    $!total-count += $step;
    $!count-by-mailbox{$mailbox} += $step;

    for @addresses -> Str $addr {
        $!known-addresses{$addr} += $step;
        $!count-by-address-and-mailbox{$addr => $mailbox} += $step;
    }
}

# Records `$email` as belonging to `$mailbox`. Emails whose `.addresses`
# is empty are ignored. The work runs inside an `Index::Add` timeline
# log entry.
method add-mail(MaildirIndexer::Email:D $email, Str:D $mailbox --> Nil) {
    MaildirIndexer::LogTimelineSchema::Index::Add.log: :class('ByAddresses'),:$mailbox, -> {
        my Str @addresses = $email.addresses or return;
        # remember what we counted, so `del-path` can subtract it later
        %!addresses-for-file{ $email.path } = @addresses;

        self.account-for(@addresses,$mailbox,1);

        return;
    }
}

# Forgets the email previously indexed from `$file` in `$mailbox`.
# Does nothing when no addresses are recorded for that path. The work
# runs inside an `Index::Rm` timeline log entry.
method del-path(IO:D $file, Str:D $mailbox --> Nil) {
    MaildirIndexer::LogTimelineSchema::Index::Rm.log: :class('ByAddresses'),:$mailbox, -> {
        # `:delete` removes the entry while fetching it: otherwise the
        # map would keep growing, and a second removal of the same path
        # would subtract the same counts again
        my $stored = %!addresses-for-file{$file.path}:delete;
        return unless $stored;

        # `.list` so the stored array is copied element by element
        # instead of being assigned as one single item
        my Str @addresses = $stored.list;

        self.account-for(@addresses,$mailbox,-1);

        return;
    }
}

# Estimates how likely `$addr` is to appear in an email of `$mailbox`,
# from the recorded counts. The 1e-15 in the numerator keeps the result
# non-zero for pairs that were never seen; the 1 in the denominator
# keeps the division defined for a mailbox with no recorded emails.
submethod p-address-given-mailbox(Str $addr, Str $mailbox) {
    my $seen-together = 1e-15 + $!count-by-address-and-mailbox{$addr => $mailbox};
    my $seen-mailbox  = 1 + $!count-by-mailbox{$mailbox};
    return $seen-together / $seen-mailbox;
}

# Returns a hash mapping each known mailbox to an (unnormalised) score
# for an email carrying `@addresses`. The score starts from the
# mailbox's share of all indexed emails and is multiplied, for every
# known address, by p(address | mailbox) when the email has that address
# and by 1 - p(address | mailbox) when it does not.
submethod predict-mailbox-given-addresses(@addresses) {
    my %prediction;
    # nothing indexed yet: there is nothing to score against
    return %prediction unless $!total-count;

    my Bag $addr-bag .= new(|@addresses);

    for $!count-by-mailbox.keys -> Str $mailbox {
        my $p = $!count-by-mailbox{$mailbox} / $!total-count;

        for $!known-addresses.keys -> Str $addr {
            my $addr-p = self.p-address-given-mailbox($addr,$mailbox);
            $p *= $addr-bag{$addr} ?? $addr-p !! 1 - $addr-p;
        }

        %prediction{$mailbox} = $p;
    }

    return %prediction;
}

# Returns the name of the mailbox with the highest score for `$email`,
# or an undefined `Str` when there is no prediction at all. The work
# runs inside an `Index::Find` timeline log entry.
method mailbox-for-email(MaildirIndexer::Email:D $email --> Str) {
    my Str $result;
    MaildirIndexer::LogTimelineSchema::Index::Find.log: :class('ByAddresses'), -> {
        my %prediction = self.predict-mailbox-given-addresses($email.addresses);

        # ascending by score, so the best candidate is the last one
        my @most-probable-mailboxes = %prediction.pairs.sort(*.value);

        if @most-probable-mailboxes {
            $result = @most-probable-mailboxes.tail.key;
        }
    }
    return $result;
}