summaryrefslogtreecommitdiff
path: root/lib/MaildirIndexer/Index/ByAddresses.rakumod
diff options
context:
space:
mode:
authordakkar <dakkar@thenautilus.net>2019-12-29 13:24:00 +0000
committerdakkar <dakkar@thenautilus.net>2019-12-29 13:45:21 +0000
commit5b10920b6e38614ceea0cd97031ab48f4f1f9a39 (patch)
tree031a9a787823b6a2913824235a77f86f76a94d88 /lib/MaildirIndexer/Index/ByAddresses.rakumod
parentit's called `raku` now (diff)
downloadMaildirIndexer-5b10920b6e38614ceea0cd97031ab48f4f1f9a39.tar.gz
MaildirIndexer-5b10920b6e38614ceea0cd97031ab48f4f1f9a39.tar.bz2
MaildirIndexer-5b10920b6e38614ceea0cd97031ab48f4f1f9a39.zip
new module extension
Diffstat (limited to 'lib/MaildirIndexer/Index/ByAddresses.rakumod')
-rw-r--r--lib/MaildirIndexer/Index/ByAddresses.rakumod92
1 files changed, 92 insertions, 0 deletions
diff --git a/lib/MaildirIndexer/Index/ByAddresses.rakumod b/lib/MaildirIndexer/Index/ByAddresses.rakumod
new file mode 100644
index 0000000..b83a239
--- /dev/null
+++ b/lib/MaildirIndexer/Index/ByAddresses.rakumod
@@ -0,0 +1,92 @@
+use v6.d;
+use MaildirIndexer::Index;
+unit class MaildirIndexer::Index::ByAddresses does MaildirIndexer::Index;
+use MaildirIndexer::LogTimelineSchema;
+use MaildirIndexer::Email;
+
+# most of this is copied from
+# p6-Algorithm-NaiveBayes:auth<cpan:TITSUKI>:ver<0.04>, in particular the
+# Algorithm::NaiveBayes::Classifier::Bernoulli class
+
+# addresses recorded for each indexed file, keyed by the file's path;
+# written by add-mail and read back by del-path
+has Array[Str] %!addresses-for-file;
+# I'd like to type-constrain these BagHash-es, but the compiler
+# currently dies if I try
+#
+# tally keyed by an (address => mailbox) Pair
+has BagHash $!count-by-address-and-mailbox .= new;
+# tally keyed by address, regardless of mailbox
+has BagHash $!known-addresses .= new;
+# tally keyed by mailbox name, bumped once per account-for call
+has BagHash $!count-by-mailbox .= new;
+# running sum of every step passed to account-for; starts undefined
+# and becomes numeric on the first `+=`
+has $!total-count;
+
+# Currently a no-op: the body is empty.
+# NOTE(review): presumably present to satisfy the MaildirIndexer::Index
+# role — confirm against the role definition.
+method dump() {
+}
+
+# Apply $step to every counter touched by one mail that mentions
+# @addresses and lives in $mailbox. Callers in this class pass +1 when
+# indexing a mail and -1 when removing one.
+submethod account-for(Str @addresses,Str $mailbox,Int $step) {
+    # mailbox-wide and global tallies move once per mail
+    $!count-by-mailbox{$mailbox} += $step;
+    $!total-count += $step;
+
+    # per-address tallies move once per listed address
+    for @addresses -> Str $address {
+        $!count-by-address-and-mailbox{$address => $mailbox} += $step;
+        $!known-addresses{$address} += $step;
+    }
+}
+
+# Index one mail: remember which addresses it carries (keyed by its
+# path) and count one more observation of each of them in $mailbox.
+# Mails without any address are ignored.
+method add-mail(MaildirIndexer::Email:D $email, Str:D $mailbox --> Nil) {
+    MaildirIndexer::LogTimelineSchema::Index::Add.log: :class('ByAddresses'),:$mailbox, -> {
+        my Str @addresses = $email.addresses;
+        # nothing to learn from a mail with no addresses
+        return unless @addresses;
+
+        # keep the list so del-path can later undo exactly this update
+        %!addresses-for-file{ $email.path } = @addresses;
+        self.account-for(@addresses,$mailbox,1);
+
+        return;
+    }
+}
+
+# Remove a previously indexed file from the statistics.
+#
+# Looks up the addresses that add-mail recorded for $file, forgets that
+# record, and subtracts one observation of each address (and of the
+# mailbox itself) via account-for. Does nothing when no addresses were
+# recorded for the file.
+method del-path(IO:D $file, Str:D $mailbox --> Nil) {
+    MaildirIndexer::LogTimelineSchema::Index::Rm.log: :class('ByAddresses'),:$mailbox, -> {
+        # :delete fetches and drops the entry in one go, so the per-file
+        # map does not keep growing and removing the same path twice
+        # cannot subtract its addresses twice
+        my $recorded = (%!addresses-for-file{$file.path}:delete);
+        return unless $recorded.defined;
+
+        # a hash element is an itemized Array; .list unpacks it so its
+        # elements (not the array as a single item) fill the typed array
+        my Str @addresses = $recorded.list or return;
+
+        self.account-for(@addresses,$mailbox,-1);
+
+        return;
+    }
+}
+
+# Estimate how likely a mail in $mailbox is to mention $addr, from the
+# recorded counts. The small constants keep the result strictly between
+# 0 and 1 even for never-seen combinations or empty mailboxes.
+submethod p-address-given-mailbox(Str $addr, Str $mailbox) {
+    my $seen-together = $!count-by-address-and-mailbox{$addr => $mailbox};
+    my $mails-in-mailbox = $!count-by-mailbox{$mailbox};
+    return (1e-15 + $seen-together) / (1 + $mails-in-mailbox);
+}
+
+# Score every known mailbox for a mail carrying @addresses.
+#
+# Returns a hash of mailbox name => score. Each score starts from the
+# mailbox's share of all indexed mail and is then multiplied, for every
+# known address, by the chance of that address being present (if the
+# mail has it) or absent (if it does not) in that mailbox.
+submethod predict-mailbox-given-addresses(@addresses) {
+    my Bag $present = Bag.new(|@addresses);
+    my %score-for;
+
+    for $!count-by-mailbox.keys -> Str $mailbox {
+        # prior: fraction of all counted mail that sits in this mailbox
+        my $score = $!count-by-mailbox{$mailbox} / $!total-count;
+
+        for $!known-addresses.keys -> Str $known {
+            my $likelihood = self.p-address-given-mailbox($known,$mailbox);
+            $score *= $present{$known} ?? $likelihood !! 1 - $likelihood;
+        }
+
+        %score-for{$mailbox} = $score;
+    }
+
+    return %score-for;
+}
+
+# Suggest the mailbox whose score for $email's addresses is highest.
+# Returns the undefined Str type object when there is no mailbox to
+# choose from.
+method mailbox-for-email(MaildirIndexer::Email:D $email --> Str) {
+    my Str $best;
+    MaildirIndexer::LogTimelineSchema::Index::Find.log: :class('ByAddresses'), -> {
+        my %score-for = self.predict-mailbox-given-addresses($email.addresses);
+
+        # ascending sort by score, so the winner is the final pair;
+        # .tail of an empty list is undefined and skips the assignment
+        with %score-for.pairs.sort(*.value).tail { $best = .key }
+    }
+    return $best;
+}