summaryrefslogtreecommitdiff
path: root/lib/MaildirIndexer/Index/ByAddresses.pm6
blob: 4f24b374bfc3f568c0c0b1baeb74deffc55eae2f (plain)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
use v6.d;
use MaildirIndexer::Index;
unit class MaildirIndexer::Index::ByAddresses does MaildirIndexer::Index;
use MaildirIndexer::LogTimelineSchema;
use MaildirIndexer::Email;
 
# most of this is copied from 
# p6-Algorithm-NaiveBayes:auth<cpan:TITSUKI>:ver<0.04>, in particular the 
# Algorithm::NaiveBayes::Classifier::Bernoulli class 
 
# Addresses recorded for each indexed file, keyed by the file's path.
has Array[Str] %!addresses-for-file;
# I'd like to type-constrain these BagHash-es, but the compiler
# currently dies if I try
# Number of mails seen per (address => mailbox) pair.
has BagHash $!count-by-address-and-mailbox .= new;
# Number of mails in which each address has been seen, across all mailboxes.
has BagHash $!known-addresses .= new;
# Number of mails accounted for in each mailbox.
has BagHash $!count-by-mailbox .= new;
# Total number of mails accounted for; starts at 0 so it is always a
# defined number when used as a divisor.
has Int $!total-count = 0;
 
# Intentionally empty: this index currently dumps nothing.
# NOTE(review): presumably required by the MaildirIndexer::Index role --
# confirm against the role definition.
method dump() {
}
 
# Adjust all counters for one mail.
#
# @addresses - the addresses found in the mail
# $mailbox   - the mailbox the mail belongs to
# $step      - the amount to add to every counter: add-mail passes 1,
#              del-path passes -1
submethod account-for(Str @addresses, Str $mailbox, Int $step) {
    $!total-count += $step;
    $!count-by-mailbox{$mailbox} += $step;

    for @addresses -> Str $addr {
        $!known-addresses{$addr} += $step;
        # the counter is keyed by the (address => mailbox) Pair
        $!count-by-address-and-mailbox{$addr => $mailbox} += $step;
    }
}
 
# Record the addresses of $email as belonging to $mailbox.
#
# Mails without any address are ignored. The address list is remembered
# under the mail's path, so that del-path can later undo the accounting.
# The work is wrapped in an Index::Add timeline log entry.
method add-mail(MaildirIndexer::Email:D $email, Str:D $mailbox --> Nil) {
    MaildirIndexer::LogTimelineSchema::Index::Add.log: :class('ByAddresses'), :$mailbox, -> {
        # `or` binds looser than `=`: assign first, then bail out of the
        # whole method if the list turned out empty
        my Str @addresses = $email.addresses or return;
        %!addresses-for-file{ $email.path } = @addresses;

        self.account-for(@addresses, $mailbox, 1);

        return;
    }
}
 
# Undo the accounting done by add-mail for the mail stored at $file.
#
# Does nothing if no addresses were recorded for that path. The work is
# wrapped in an Index::Rm timeline log entry.
method del-path(IO:D $file, Str:D $mailbox --> Nil) {
    MaildirIndexer::LogTimelineSchema::Index::Rm.log: :class('ByAddresses'), :$mailbox, -> {
        # Remove the record while fetching it, so the per-file table does
        # not grow forever and a repeated deletion of the same path cannot
        # decrement the counters twice.
        my $recorded = %!addresses-for-file{$file.path}:delete;
        return without $recorded;

        # A hash value is itemized: assigning it directly to an array would
        # store the whole list as a single element (and fail the Str
        # constraint), so flatten it explicitly with .list
        my Str @addresses = $recorded.list or return;

        self.account-for(@addresses, $mailbox, -1);

        return;
    }
}
 
# Smoothed estimate of the probability that a mail in $mailbox
# mentions $addr.
#
# Adds 1 to the (address, mailbox) count and 2 to the mailbox count, so
# the result is never exactly 0 or 1, even for combinations that were
# never seen.
submethod p-address-given-mailbox(Str $addr, Str $mailbox) {
    my $a = 1 + $!count-by-address-and-mailbox{$addr => $mailbox};
    my $b = 2 + $!count-by-mailbox{$mailbox};
    return $a / $b;
}
 
# Score every known mailbox for a mail containing @addresses.
#
# Returns a hash mapping mailbox name to score. Each score starts from the
# mailbox's share of all mails, and is then multiplied, for every known
# address, by the probability of that address being present (if the mail
# has it) or absent (if it does not).
#
# NOTE(review): the score is a product over all known addresses; with very
# many addresses it may become too small to compare reliably -- confirm
# whether that matters for realistic maildirs.
submethod predict-mailbox-given-addresses(@addresses) {
    my %prediction;
    my Bag $addr-bag = @addresses.Bag;
    # the set of known addresses does not change while we iterate
    my @known = $!known-addresses.keys;

    for $!count-by-mailbox.keys -> Str $mailbox {
        my $p = $!count-by-mailbox{$mailbox} / $!total-count;

        for @known -> Str $addr {
            my $addr-p = self.p-address-given-mailbox($addr, $mailbox);
            if $addr-bag{$addr} {
                $p *= $addr-p;
            }
            else {
                $p *= 1 - $addr-p;
            }
        }

        %prediction{$mailbox} = $p;
    }

    return %prediction;
}
 
# Return the name of the mailbox with the highest score for $email.
#
# Returns the Str type object (undefined) when no mailbox has been
# indexed yet. The work is wrapped in an Index::Find timeline log entry.
method mailbox-for-email(MaildirIndexer::Email:D $email --> Str) {
    my Str $result;
    MaildirIndexer::LogTimelineSchema::Index::Find.log: :class('ByAddresses'), -> {
        my %prediction = self.predict-mailbox-given-addresses($email.addresses);

        # only the best entry is needed, so take the maximum instead of
        # sorting everything; the guard matters because max on an empty
        # list yields -Inf, which has no usable .key
        if %prediction {
            $result = %prediction.pairs.max(*.value).key;
        }
    }
    return $result;
}