summaryrefslogtreecommitdiff
path: root/lib/MaildirIndexer/Index/ByAddresses.pm6
blob: 5636f1f2b16491a9997d9ab6d043a68a8d3acb4c (plain)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
use v6.d;
use MaildirIndexer::Index;
unit class MaildirIndexer::Index::ByAddresses does MaildirIndexer::Index;
use MaildirIndexer::LogTimelineSchema;
use MaildirIndexer::Email;
 
# most of this is copied from 
# p6-Algorithm-NaiveBayes:auth<cpan:TITSUKI>:ver<0.04>, in particular the 
# Algorithm::NaiveBayes::Classifier::Bernoulli class 
 
# Addresses recorded for each indexed file, keyed by the file's path, so
# the file's contribution can be subtracted again when it is removed.
has Array[Str] %!addresses-for-file;
# {address}{mailbox} => how many times the address was seen in emails
# filed under that mailbox.
has %!count-by-address-and-mailbox;
# address => how many times it was seen overall; the keys double as the
# list of features considered when predicting.
has %!known-addresses;
# mailbox => number of emails accounted for in that mailbox.
has %!count-by-mailbox;
# Number of emails accounted for across all mailboxes.
has $!total-count = 0;
 
# Currently a no-op: the body is empty, so nothing is written anywhere.
# NOTE(review): presumably required by the MaildirIndexer::Index role —
# confirm before removing.
method dump() {
}
 
# Adjusts every counter for one email.
#
#   @addresses - the addresses found in the email
#   $mailbox   - the mailbox the email belongs to
#   $step      - amount to add to each counter; callers in this class pass
#                1 when adding an email and -1 when removing one
#
# Counters are only ever adjusted, never deleted, so an address or mailbox
# whose count drops back to zero stays present as a key.
submethod account-for(Str @addresses,$mailbox,Int $step) {
    $!total-count += $step;
    %!count-by-mailbox{$mailbox} += $step;

    for @addresses -> $addr {
        %!known-addresses{$addr} += $step;
        %!count-by-address-and-mailbox{$addr}{$mailbox} += $step;
    }
}
 
# Records an email as belonging to $mailbox.
#
# The email's addresses are remembered under the email's path (so that
# del-path can undo the accounting later) and every counter is
# incremented. Emails without any address are ignored.
#
# The work runs inside an Index::Add task of the log timeline.
method add-mail(MaildirIndexer::Email:D $email, Str:D $mailbox --> Nil) {
    MaildirIndexer::LogTimelineSchema::Index::Add.log: :class('ByAddresses'),:$mailbox, -> {
        # nothing to learn from an email that mentions no address
        my Str @addresses = $email.addresses or return;
        %!addresses-for-file{ $email.path } = @addresses;

        self.account-for(@addresses,$mailbox,1);

        return;
    }
}
 
# Forgets a file that used to be in $mailbox.
#
# Looks up the addresses that were recorded for the file's path when it
# was added, decrements every counter accordingly, and drops the recorded
# entry. Files this index never saw (or that had no addresses) are
# ignored.
#
# The work runs inside an Index::Rm task of the log timeline.
method del-path(IO:D $file, Str:D $mailbox --> Nil) {
    MaildirIndexer::LogTimelineSchema::Index::Rm.log: :class('ByAddresses'),:$mailbox, -> {
        # :delete hands back the stored value and removes the key, so the
        # map does not grow forever and a repeated removal of the same
        # path cannot decrement the counters twice.
        #
        # The value is kept in a scalar on purpose: a hash value is
        # itemized, and assigning it to a `Str @array` would try to store
        # the whole array as a single element instead of copying its
        # contents.
        my $addresses = %!addresses-for-file{$file.path}:delete;
        return unless $addresses;

        self.account-for($addresses,$mailbox,-1);

        return;
    }
}
 
# Estimated probability that an email filed under $mailbox mentions $addr.
#
# Uses add-one smoothing over the two outcomes (address present / absent):
# (1 + count of $addr in $mailbox) / (2 + count of emails in $mailbox).
# Unknown addresses or mailboxes count as zero, so the result is always
# strictly between 0 and 1 as long as the counters are non-negative.
submethod p-address-given-mailbox($addr,$mailbox) {
    my $a = 1 + (%!count-by-address-and-mailbox{$addr}{$mailbox} // 0);
    my $b = 2 + (%!count-by-mailbox{$mailbox} // 0);
    return $a / $b;
}
 
# Scores every known mailbox for an email containing @addresses.
#
# Returns a hash mailbox => score, where the score is the product of
#   * for every known address: p(address | mailbox) if the email contains
#     it, 1 - p(address | mailbox) otherwise;
#   * the prior of the mailbox: its share of all accounted emails.
# Scores are not normalised; only their relative order is meaningful.
#
# Returns an empty hash when no email is currently accounted for.
submethod predict-mailbox-given-addresses(@addresses) {
    my %prediction;

    # With nothing accounted for, the prior below would be 0/0; there is
    # nothing to base a prediction on, so report no candidates.
    return %prediction unless $!total-count;

    my Bag $addr-bag .= new(|@addresses);

    for %!count-by-mailbox.keys -> $mailbox {
        my $p = 1;

        for %!known-addresses.keys -> $addr {
            if ($addr-bag{$addr}) {
                $p *= self.p-address-given-mailbox($addr,$mailbox);
            }
            else {
                $p *= (1 - self.p-address-given-mailbox($addr,$mailbox));
            }
        }
        # weigh by how common the mailbox is overall
        $p *= %!count-by-mailbox{$mailbox} / $!total-count;
        %prediction{$mailbox} = $p;
    }

    return %prediction;
}
 
# Returns the name of the mailbox with the highest score for $email, or
# an undefined Str when there are no candidate mailboxes.
#
# The work runs inside an Index::Find task of the log timeline.
method mailbox-for-email(MaildirIndexer::Email:D $email --> Str) {
    my Str $result;
    MaildirIndexer::LogTimelineSchema::Index::Find.log: :class('ByAddresses'), -> {
        my %prediction = self.predict-mailbox-given-addresses($email.addresses);

        # ascending by score, so the best candidate is the last one
        my @most-probable-mailboxes = %prediction.pairs.sort(*.value).map(*.key);

        # an empty list is false: $result then stays undefined
        if @most-probable-mailboxes -> $_ { $result = .[*-1] }
    }
    return $result;
}