summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authordakkar <dakkar@thenautilus.net>2019-12-28 15:27:03 +0000
committerdakkar <dakkar@thenautilus.net>2019-12-28 15:31:23 +0000
commit86f7c7f6b63b24210ec3ed3e88d9d0f53d33f805 (patch)
tree354e0bd4ef89506d994c485d1ef50f7ab85a1b84
parenttest multi-index store (diff)
downloadMaildirIndexer-86f7c7f6b63b24210ec3ed3e88d9d0f53d33f805.tar.gz
MaildirIndexer-86f7c7f6b63b24210ec3ed3e88d9d0f53d33f805.tar.bz2
MaildirIndexer-86f7c7f6b63b24210ec3ed3e88d9d0f53d33f805.zip
naive-bayes classifying index
half-tested by hand
-rw-r--r--bayes6
-rw-r--r--lib/MaildirIndexer/Index/ByAddresses.pm689
-rw-r--r--t/parser.t4
3 files changed, 96 insertions, 3 deletions
diff --git a/bayes b/bayes
index a99a3e9..2601874 100644
--- a/bayes
+++ b/bayes
@@ -5,12 +5,16 @@ use MaildirIndexer::ScanDir;
use MaildirIndexer::Store;
use MaildirIndexer::Server;
use MaildirIndexer::Index::ByRef;
+use MaildirIndexer::Index::ByAddresses;
sub MAIN($maildir) {
my $file-channel = scan-dir($maildir).Channel;
my $store = MaildirIndexer::Store.new(
:$file-channel,
- indices => MaildirIndexer::Index::ByRef.new,
+ indices => (
+ MaildirIndexer::Index::ByRef.new,
+ MaildirIndexer::Index::ByAddresses.new,
+ ),
);
my $server = MaildirIndexer::Server.new(:$store);
diff --git a/lib/MaildirIndexer/Index/ByAddresses.pm6 b/lib/MaildirIndexer/Index/ByAddresses.pm6
new file mode 100644
index 0000000..5636f1f
--- /dev/null
+++ b/lib/MaildirIndexer/Index/ByAddresses.pm6
@@ -0,0 +1,89 @@
+use v6.d;
+use MaildirIndexer::Index;
+unit class MaildirIndexer::Index::ByAddresses does MaildirIndexer::Index;
+use MaildirIndexer::LogTimelineSchema;
+use MaildirIndexer::Email;
+
+# most of this is copied from
+# p6-Algorithm-NaiveBayes:auth<cpan:TITSUKI>:ver<0.04>, in particular the
+# Algorithm::NaiveBayes::Classifier::Bernoulli class
+
+has Array[Str] %!addresses-for-file;
+has %!count-by-address-and-mailbox;
+has %!known-addresses;
+has %!count-by-mailbox;
+has $!total-count;
+
+method dump() {
+}
+
+submethod account-for(Str @addresses,$mailbox,Int $step) {
+ $!total-count += $step;
+ %!count-by-mailbox{$mailbox} += $step;
+
+ for @addresses -> $addr {
+ %!known-addresses{$addr} += $step;
+ %!count-by-address-and-mailbox{$addr}{$mailbox} += $step;
+ }
+}
+
+method add-mail(MaildirIndexer::Email:D $email, Str:D $mailbox --> Nil) {
+ MaildirIndexer::LogTimelineSchema::Index::Add.log: :class('ByAddresses'),:$mailbox, -> {
+ my Str @addresses = $email.addresses or return;
+ %!addresses-for-file{ $email.path } = @addresses;
+
+ self.account-for(@addresses,$mailbox,1);
+
+ return;
+ }
+}
+
+method del-path(IO:D $file, Str:D $mailbox --> Nil) {
+ MaildirIndexer::LogTimelineSchema::Index::Rm.log: :class('ByAddresses'),:$mailbox, -> {
+ my Str @addresses = %!addresses-for-file{$file.path} or return;
+
+ self.account-for(@addresses,$mailbox,-1);
+
+ return;
+ }
+}
+
+submethod p-address-given-mailbox($addr,$mailbox) {
+ my $a = 1 + (%!count-by-address-and-mailbox{$addr}{$mailbox} // 0);
+ my $b = 2 + (%!count-by-mailbox{$mailbox} // 0);
+ return $a / $b;
+}
+
+submethod predict-mailbox-given-addresses(@addresses) {
+ my %prediction;
+ my Bag $addr-bag .= new(|@addresses);
+
+ for %!count-by-mailbox.keys -> $mailbox {
+ my $p = 1;
+
+ for %!known-addresses.keys -> $addr {
+ if ($addr-bag{$addr}) {
+ $p *= self.p-address-given-mailbox($addr,$mailbox);
+ }
+ else {
+ $p *= (1 - self.p-address-given-mailbox($addr,$mailbox));
+ }
+ }
+ $p *= %!count-by-mailbox{$mailbox} / $!total-count;
+ %prediction{$mailbox} = $p;
+ }
+
+ return %prediction;
+}
+
+method mailbox-for-email(MaildirIndexer::Email:D $email --> Str) {
+ my Str $result;
+ MaildirIndexer::LogTimelineSchema::Index::Find.log: :class('ByAddresses'), -> {
+ my %prediction = self.predict-mailbox-given-addresses($email.addresses);
+
+ my @most-probable-mailboxes = %prediction.pairs.sort(*.value).map(*.key);
+
+ if @most-probable-mailboxes -> $_ { $result = .[*-1] }
+ }
+ return $result;
+}
diff --git a/t/parser.t b/t/parser.t
index aafbb27..49be92e 100644
--- a/t/parser.t
+++ b/t/parser.t
@@ -5,12 +5,12 @@ use MaildirIndexer::Parser;
use MaildirIndexer::Email;
subtest 'from string' => {
- is-deeply parse-email(''), Nil, "empty string won't parse";
+ ok !parse-email(''), "empty string won't parse";
my $message = q:to<EOM>;
bad stuff
EOM
- is-deeply parse-email($message), Nil, "bad message won't parse";
+ ok !parse-email($message), "bad message won't parse";
$message = q:to<EOM>;
Head: value