summaryrefslogtreecommitdiff
path: root/maildir-indexer.pl
diff options
context:
space:
mode:
Diffstat (limited to 'maildir-indexer.pl')
-rw-r--r--maildir-indexer.pl388
1 files changed, 388 insertions, 0 deletions
diff --git a/maildir-indexer.pl b/maildir-indexer.pl
new file mode 100644
index 0000000..1a7a4fc
--- /dev/null
+++ b/maildir-indexer.pl
@@ -0,0 +1,388 @@
+#!/usr/bin/perl
+use strict;
+use warnings;
+use threads;
+use threads::shared;
+use IO::Socket::INET;
+use File::Next;
+use Email::Simple;
+
+=head1 NAME
+
+maildir-indexer.pl
+
+=head1 AUTHOR
+
+Gianni Ceccarelli <dakkar@thenautilus.net>
+
+=head1 COPYRIGHT & LICENSE
+
+Copyright 2009 Gianni Ceccarelli
+
+This program is free software: you can redistribute it and/or modify
+it under the terms of the GNU General Public License as published by
+the Free Software Foundation, either version 3 of the License, or (at
+your option) any later version.
+
+This program is distributed in the hope that it will be useful, but
+WITHOUT ANY WARRANTY; without even the implied warranty of
+MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+General Public License for more details.
+
+For a copy of the GNU General Public License, see
+L<http://www.gnu.org/licenses/>.
+
+=cut
+
+{
+
+=head1 Index handling
+
+C<%files2id> maps a mailbox-file pair to a message id
+(C<$files2id{$mailbox}->{$file}=$id>).
+
+C<%id2mailbox> maps a message id to a list of mailboxes.
+
+These hashes are private to the index handling section: the following
+function must be used to manipulate the index.
+
+=cut
+
+my %files2id : shared;
+my %id2mailbox : shared;
+
+=head2 C<add_file>
+
+ add_file($mailbox,$file,$id);
+
+Adds an entry to the index. Takes care of locking, and of sharing
+newly-created hashes and arrays.
+
+=cut
+
+sub add_file {
+ my ($mailbox,$file,$id)=@_;
+ lock(%files2id);
+
+ # Auto-vivified references are not shared, we must share them explicitly.
+ # See threads::shared for an explanation of the '&share' call style.
+
+ $files2id{$mailbox}||=&share({});
+ $files2id{$mailbox}->{$file}=$id;
+
+ $id2mailbox{$id}||=&share([]);
+ push @{$id2mailbox{$id}},$mailbox;
+
+ return;
+}
+
+=head2 C<del_file>
+
+ del_file($mailbox,$file);
+
+Removes an entry from the index. Takes care of locking, and of
+removing empty hashes and arrays.
+
+=cut
+
+sub del_file {
+ my ($mailbox,$file)=@_;
+ lock(%files2id);
+
+ return unless exists $files2id{$mailbox};
+ return unless exists $files2id{$mailbox}->{$file};
+ my $id=delete $files2id{$mailbox}->{$file};
+
+ return unless exists $id2mailbox{$id};
+ # remove the mailbox, keeping the order
+ @{$id2mailbox{$id}}=grep {$_ ne $mailbox} @{$id2mailbox{$id}};
+ delete $id2mailbox{$id} unless @{$id2mailbox{$id}};
+
+ return;
+}
+
+=head2 C<get_mailboxes>
+
+ @mailboxes = get_mailboxes($id);
+
+Returns all the mailboxes that contain a message with the given
+message id, in roughly chronological order. Can return an empty list.
+
+=cut
+
+sub get_mailboxes {
+ my ($id)=@_;
+ lock(%files2id);
+
+ return unless exists $id2mailbox{$id};
+ return @{$id2mailbox{$id}};
+}
+
+}
+
+=head1 Message & mailbox parsing
+
+=head2 C<id_from_file>
+
+ $id = id_from_file($dir,$file);
+
+Returns the message id of the C<$file> in the C<$dir>. Returns
+C<undef> if the file can't be read, or it does not contain a message
+id.
+
+=cut #'
+
+sub id_from_file {
+ my ($dir,$file)=@_;
+
+ # we might use Email::Simple here, but it's useless to load
+ # and parse the entire message
+
+ open my $fh,'<',"$dir/$file" or return;
+ while (my $line=<$fh>) {
+ $line=~s{[\x0d\x0a]+\z}{};
+ return unless length($line); # end of headers, no messageid found
+ $line =~ m{^Message-ID: \s* <(\S+?)>}xi and return $1;
+ }
+ return;
+}
+
+=head2 C<split_refs>
+
+ @ids = split_refs($header_field_value);
+
+Extracts the message ids from the value of a header file such as
+C<In-Reply-To:> or C<References:>.
+
+=cut
+
+sub split_refs {
+ my ($str)=@_;
+ return unless $str;
+
+ $str=~s{^.*<}{};$str=~s{>.*$}{};
+ return split />.*?</,$str;
+}
+
+=head2 C<mailbox_from_path>
+
+ $mailbox = mailbox_from_path($dir);
+
+Extracts the mailbox name from a directory name. Assumes that all
+mailboxes are under a C<.maildir/> directory. Returns C<undef> if the
+directory name can't be parsed.
+
+=cut #'
+
+sub mailbox_from_path {
+ my ($dir)=@_;
+
+ # /home/dakkar/.maildir/.mail.Personal.Dakkar/cur
+ # ->
+ # .mail.Personal.Dakkar
+ $dir=~m{/.maildir/(.*?)/(?:cur|new|tmp)} and return $1;
+ return;
+}
+
+=head2 C<ignorable>
+
+ if (ignorable($mailbox)) { ... }
+
+Returns whether a mailbox can be safely ignored by the indexer. There
+are two ignorable mailboxes: C<.Trash> and C<.mail.SPAM>.
+
+=cut
+
+sub ignorable {
+ return $_[0] =~ m{(?:^|/)(?:\.mail\.SPAM|\.Trash)(?:/|$)};
+}
+
+=head1 Scanning the mail archive
+
+=head2 C<scan_directory>
+
+Using L<File::Next>, scans C<$ENV{HOME}/.maildir> for message files,
+and adds them to the index.
+
+=cut
+
+sub scan_directory {
+ my $files=File::Next::files({
+ file_filter => sub {
+ # /cur/ and /new/ are the only interesting directories in a maildir
+ ($File::Next::dir =~ m{/(?:cur|new)(?:/|$)})
+ and
+ # we ignore dovecot's files
+ ($_ !~ m{^dovecot});
+ },
+ descend_filter => sub {
+ # ignorable mailboxes are ignored
+ !ignorable($_);
+ },
+ follow_symlinks=>0,
+ }, "$ENV{HOME}/.maildir");
+
+ my ($dir,$file,$fullpath,$mailbox,$id);
+ while (($dir,$file,$fullpath)=$files->()) {
+ $mailbox=mailbox_from_path($dir);
+ next unless defined $mailbox; # might not be a real mailbox
+ next if ignorable($mailbox); # should not happen
+ $id=id_from_file($dir,$file);
+ next unless defined $id; # might not be a message, or have disappeared
+ add_file($mailbox,$file,$id);
+ }
+
+ return;
+}
+
+{
+
+=head1 Watching the mail archive for changes
+
+=cut
+
+my $child_pid : shared;
+# cleanup action
+END {
+ kill 2,$child_pid if $child_pid;
+}
+
+=head2 C<watch_directory>
+
+Parses the output from C<inotifywait>. For the events C<CREATED> and
+C<MOVED_TO>, adds entries to the index. For the events C<DELETED> and
+C<MOVED_FROM>, removes entries from the index.
+
+=cut
+
+sub watch_directory {
+ # see inotifywait(1)
+ $child_pid=
+ open my $inotify,'-|',
+ qw(inotifywait -q -m -r
+ --exclude (^|/)(dovecot|tmp/).*
+ -c
+ -e move -e delete -e create),
+ "$ENV{HOME}/.maildir"
+ or die "Can't start inotifywait: $!";
+
+ while (my $line=<$inotify>) {
+ chomp($line);
+
+ # these lines are a naif parser for the CSV output of inotifywait
+ my ($dir,$event,$file)=
+ ($line =~ m{^ ( (?:".*?") | [^,]* ) ,
+ ( (?:".*?") | [^,]* ) ,
+ ( (?:".*?") | [^,]* ) $ }smx );
+ s{(?:\A")|(?:"\z)}{}g for $dir,$event,$file;
+
+ my $mailbox=mailbox_from_path($dir);
+ next unless defined $mailbox; # might not be a mailbox event
+ next if ignorable($mailbox); # ignore ignorable mailboxes
+
+ if ($event =~ /CREATE|MOVED_TO/) {
+ # a file has appeared: get the message id
+ my $id=id_from_file($dir,$file);
+ if (defined $id) {
+ # it is a proper message, add it to the index
+ add_file($mailbox,$file,$id);
+ }
+ }
+ elsif ($event =~ /DELETE|MOVED_FROM/) {
+ # a file has disappeared: remove it from the index
+ del_file($mailbox,$file);
+ }
+ }
+}
+
+}
+
+=head1 Serving requests
+
+=head2 C<server>
+
+Opens a listening socket on C<localhost:9000>. Whenever a connection
+is received, executes C<handle_client> in a separate thread.
+
+=cut
+
+sub server {
+ my $serv_sock=IO::Socket::INET->new(
+ Listen=>10,
+ LocalAddr=>'127.0.0.1',
+ LocalPort=>9000,
+ Proto=>'tcp',
+ ReuseAddr=>1,
+ );
+ while (my $other_sock=$serv_sock->accept) {
+ threads->create(\&handle_client,$other_sock)->detach;
+ }
+}
+
+=head2 C<handle_client>
+
+This funtction expects a mail message from the socket. The message is
+parsesd, and possible references extracted. If the are references, and
+one of them is known to the index, the mailbox name of such a
+reference is printed to the socket.
+
+=cut
+
+sub handle_client {
+ my ($client)=@_;
+
+ # we get a message header, followed by a blank line
+ my $header;
+ while (my $line=<$client>) {
+ $line=~s{[\x0d\x0a]+\z}{\x0d\x0a};
+ $header.=$line;
+ last if $line eq "\x0d\x0a";
+ }
+
+ my $header_obj=Email::Simple->new(\$header)->header_obj;
+ # In-Reply-To is the most trustworthy
+ # References are listed in chronological order
+ # @refs ends up with the ids in decreasing order of importance
+ my @refs=(split_refs($header_obj->header('In-Reply-To')),
+ reverse(split_refs($header_obj->header('References'))));
+
+ my @mailboxes;
+ for my $id (@refs) {
+ @mailboxes=get_mailboxes($id);
+ if (@mailboxes) {
+ # we know where a message is! tell the client
+ # (we only print the most recent mailbox)
+ # the client may have disconnected: printing to a
+ # closed socket makes the server die
+ print {$client} "$mailboxes[-1]\x0d\x0a"
+ if $client->connected;
+
+ return;
+ }
+ }
+ return;
+}
+
+=head1 Main program
+
+=over 4
+
+=item *
+
+start C<watch_directory> in a thread
+
+=item *
+
+start C<scan_directory> in another thread
+
+=item *
+
+start C<server> in the main thread
+
+=back
+
+=cut
+
+threads->create(\&watch_directory)->detach;
+threads->create(\&scan_directory)->detach;
+server();