From 0b356d2e192fa531fb149cfd89f9d93324a90ebe Mon Sep 17 00:00:00 2001 From: Gianni Ceccarelli Date: Wed, 22 Feb 2023 13:53:11 +0000 Subject: store misskey notes & people --- cpanfile | 11 ++- lib/Dakkar/Misskey.pm | 18 ++-- lib/Dakkar/NotesArchive.pm | 3 +- lib/Dakkar/TweetArchive/Store.pm | 199 +++++++++++++++++++++++++++++++++++++-- tweet-archive.pl | 72 ++++++++++---- 5 files changed, 257 insertions(+), 46 deletions(-) diff --git a/cpanfile b/cpanfile index 66ba8c6..38449c5 100644 --- a/cpanfile +++ b/cpanfile @@ -1,15 +1,16 @@ #!perl requires 'perl', '>= 5.036'; +requires 'DBD::Pg'; +requires 'DBI'; requires 'DateTime::Format::Pg'; requires 'DateTime::Format::Strptime'; -requires 'DBI'; -requires 'DBD::Pg'; -requires 'experimental'; requires 'JSON::MaybeXS'; +requires 'List::Util'; requires 'Moo'; requires 'Moo::Role'; -requires 'namespace::clean'; requires 'Net::Twitter'; requires 'Path::Tiny'; +requires 'PerlX::Maybe'; requires 'Types::Standard'; -requires 'List::Util'; +requires 'experimental'; +requires 'namespace::clean'; diff --git a/lib/Dakkar/Misskey.pm b/lib/Dakkar/Misskey.pm index 8d6dd5e..327cfe8 100644 --- a/lib/Dakkar/Misskey.pm +++ b/lib/Dakkar/Misskey.pm @@ -10,7 +10,7 @@ use URI; use namespace::clean; has _json => ( is => 'lazy', builder => sub { JSON::MaybeXS->new(utf8=>1,relaxed=>1, pretty=>0) } ); -has _ua => ( is => 'lazy', builder => sub { LWP::UserAgent->new(agent=>'Dakkar::Misskey') } ); +has ua => ( is => 'lazy', builder => sub { LWP::UserAgent->new(agent=>'Dakkar::Misskey') } ); has token => ( is => 'ro', required => 1 ); has base_url => ( is => 'ro', required => 1, isa => Uri, coerce => 1 ); @@ -23,22 +23,19 @@ sub _request($self, $endpoint, $payload) { my $uri = URI->new($endpoint)->abs($self->base_url); - my $response = $self->_ua->post( + my $response = $self->ua->post( $uri, 'Content-type' => 'application/json', Content => $payload_json, ); if ($response->is_success) { - return( - $self->_json->decode( - $response->decoded_content(charset=>'none') - ), - undef, + return $self->_json->decode( + $response->decoded_content(charset=>'none') ); } - return( undef, $response->status_line ); + die $response->status_line; } sub _paged_request($self, $endpoint, $payload) { @@ -50,8 +47,7 @@ sub _paged_request($self, $endpoint, $payload) { }; while (1) { - my ($result, $error) = $self->_request($endpoint, $page_payload); - return (\@all_results, $error) if $error; + my $result = $self->_request($endpoint, $page_payload); last unless $result->@*; @@ -59,7 +55,7 @@ sub _paged_request($self, $endpoint, $payload) { $page_payload->{untilId} = minstr(map { $_->{id} } $result->@* ); } - return (\@all_results, undef); + return \@all_results; } sub timeline($self,$options) { diff --git a/lib/Dakkar/NotesArchive.pm b/lib/Dakkar/NotesArchive.pm index 5571419..1fcf248 100644 --- a/lib/Dakkar/NotesArchive.pm +++ b/lib/Dakkar/NotesArchive.pm @@ -2,6 +2,7 @@ package Dakkar::NotesArchive; use v5.36; use Moo; use experimental 'builtin'; +use PerlX::Maybe; use Dakkar::Misskey; use Types::Standard qw(Str InstanceOf); use namespace::clean; @@ -27,7 +28,7 @@ sub _build_client($self) { sub timeline($self, $since_id) { return $self->client->timeline({ - sinceId => $since_id, + maybe sinceId => $since_id, includeMyRenotes => \1, includeLocalRenotes => \1, includeRenotedMyNotes => \0, diff --git a/lib/Dakkar/TweetArchive/Store.pm b/lib/Dakkar/TweetArchive/Store.pm index fc75475..f0068de 100644 --- a/lib/Dakkar/TweetArchive/Store.pm +++ b/lib/Dakkar/TweetArchive/Store.pm @@ -35,6 +35,7 @@ sub BUILD($self,@) { } my $dt_parser = DateTime::Format::Strptime->new(pattern => '%a %b %d %T %z %Y'); +my $dt_parser_iso = DateTime::Format::Strptime->new(pattern => '%Y-%m-%dT%H:%M:%S.%3NZ', time_zone=>'UTC'); my $dt_printer = DateTime::Format::Pg->new(); my $json_printer = JSON::MaybeXS->new( @@ -46,6 +47,8 @@ my $json_printer = JSON::MaybeXS->new( convert_blessed => 1, ); +# tweets + sub latest_tweet_id($self) { return $self->dbh->selectall_arrayref( q{SELECT MAX(id) FROM tweets}, @@ -71,14 +74,14 @@ SQL ); } -sub _store_people($self,$people) { +sub _store_twitter_people($self,$people) { my @ids; for my $person ($people->@*) { my $person_str = $json_printer->encode($person); push @ids, $self->dbh->selectall_arrayref(<<'SQL',{},$person_str)->[0][0]; -INSERT INTO people(data) VALUES (?) -ON CONFLICT (people_details(data)) DO UPDATE +INSERT INTO twitter_people(data) VALUES (?) +ON CONFLICT (twitter_people_details(data)) DO UPDATE SET data=EXCLUDED.data RETURNING id SQL @@ -88,22 +91,84 @@ SQL } -sub store_friends($self,$friends) { - my $ids = $self->_store_people($friends); +sub store_twitter_friends($self,$friends) { + my $ids = $self->_store_twitter_people($friends); $self->dbh->do(<<"SQL", {}, $ids); -INSERT INTO friends(users) VALUES(?) +INSERT INTO twitter_friends(users) VALUES(?) SQL } -sub store_followers($self,$followers) { - my $ids = $self->_store_people($followers); +sub store_twitter_followers($self,$followers) { + my $ids = $self->_store_twitter_people($followers); $self->dbh->do(<<"SQL", {}, $ids); -INSERT INTO followers(users) VALUES(?) +INSERT INTO twitter_followers(users) VALUES(?) +SQL +} + +# misskey notes + +sub latest_note_id($self) { + return $self->dbh->selectall_arrayref( + q{SELECT MAX(id) FROM notes}, + )->[0][0]; +} + +sub store_note($self,$note) { + # yes, the source most probably decoded this from a string, we + # have to serialise it again, so that PostgreSQL can parse it + # *again* + my $note_str = $json_printer->encode($note); + my $created_at = $dt_parser_iso->parse_datetime($note->{createdAt}); + + $self->dbh->do(<<'SQL', {}, +INSERT INTO notes(id,created_at,data) VALUES(?,?,?) + ON CONFLICT (id) DO UPDATE SET + created_at = EXCLUDED.created_at, + data = EXCLUDED.data +SQL + $note->{id}, + $dt_printer->format_datetime($created_at), + $note_str, + ); +} + +sub _store_misskey_people($self,$people) { + my @ids; + for my $person ($people->@*) { + my $person_str = $json_printer->encode($person); + push @ids, + $self->dbh->selectall_arrayref(<<'SQL',{},$person_str)->[0][0]; +INSERT INTO misskey_people(data) VALUES (?) +ON CONFLICT (misskey_people_details(data)) DO UPDATE + SET data=EXCLUDED.data +RETURNING id SQL + } + + return \@ids; } + +sub store_misskey_following($self,$friends) { + my $ids = $self->_store_twitter_people($friends); + + $self->dbh->do(<<"SQL", {}, $ids); +INSERT INTO misskey_following(users) VALUES(?) +SQL +} + +sub store_misskey_followers($self,$followers) { + my $ids = $self->_store_misskey_people($followers); + + $self->dbh->do(<<"SQL", {}, $ids); +INSERT INTO misskey_followers(users) VALUES(?) +SQL +} + +# schema + sub _schema_deploy($self,$next_version) { my $method_name = "_schema_deploy_${next_version}"; if (my $method = $self->can($method_name)) { @@ -359,4 +424,120 @@ $$; SQL } +sub _schema_deploy_6($self) { + my $dbh = $self->dbh; + + $dbh->do('ALTER TABLE people RENAME TO twitter_people'); + $dbh->do('ALTER TABLE friends RENAME TO twitter_friends'); + $dbh->do('ALTER TABLE followers RENAME TO twitter_followers'); + $dbh->do('ALTER INDEX idx_people RENAME TO idx_twitter_people'); + $dbh->do('ALTER FUNCTION people_details RENAME TO twitter_people_details'); + + $dbh->do(<<'SQL'); +CREATE OR REPLACE FUNCTION note_text(IN t JSONB) RETURNS text +LANGUAGE sql +IMMUTABLE +RETURNS NULL ON NULL INPUT +PARALLEL SAFE +AS $$ +SELECT t->>'text' +$$; +SQL + + $dbh->do(<<'SQL'); +CREATE OR REPLACE FUNCTION note_text_recursive(IN t JSONB) RETURNS text +LANGUAGE sql +IMMUTABLE +RETURNS NULL ON NULL INPUT +PARALLEL SAFE +AS $$ SELECT +note_text(t) || ' ' || +COALESCE( note_text_recursive(t->'renote'), '') +$$; +SQL + + # misskey doesn't currently store language, let's pretend it's all + # English + $dbh->do(<<'SQL'); +CREATE OR REPLACE FUNCTION note_language(IN t JSONB) RETURNS regconfig +LANGUAGE sql +IMMUTABLE +RETURNS NULL ON NULL INPUT +PARALLEL SAFE +AS $$ + SELECT 'pg_catalog.english'::regconfig +$$; +SQL + + $dbh->do(<<'SQL'); +CREATE TABLE notes ( + id VARCHAR(255) PRIMARY KEY, + created_at TIMESTAMP WITH TIME ZONE NOT NULL DEFAULT CURRENT_TIMESTAMP, + data JSONB NOT NULL, + fts tsvector +); +SQL + + $dbh->do(<<'SQL'); +CREATE INDEX notes_fts ON notes USING GIN (fts); +SQL + + $dbh->do(<<'SQL'); +CREATE OR REPLACE FUNCTION notes_fts_trigger() +RETURNS trigger +LANGUAGE plpgsql +AS $$ +begin + new.fts := to_tsvector(note_language(new.data),note_text_recursive(new.data)); + return new; +end +$$; +SQL + + $dbh->do(<<'SQL'); +CREATE TRIGGER notes_fts_update + BEFORE INSERT OR UPDATE + ON notes + FOR EACH ROW + EXECUTE PROCEDURE notes_fts_trigger(); +SQL + + $dbh->do(<<'SQL'); +CREATE TABLE misskey_people ( + id SERIAL PRIMARY KEY, + data JSONB NOT NULL +) +SQL + $dbh->do(<<'SQL'); +CREATE FUNCTION misskey_people_details(data jsonb) RETURNS text[] AS $$ + SELECT array[ + (data->>'id'), + (data->>'username'), + COALESCE(data->>'host',''), + (data->>'location'), + (data->>'description'), + (data->>'name'), + (data->>'avatarBlurhash'), + (data->>'bannerBlurhash') + ]; +$$ LANGUAGE SQL IMMUTABLE RETURNS NULL ON NULL INPUT +SQL + $dbh->do(<<'SQL'); +CREATE UNIQUE INDEX idx_miskkey_people ON misskey_people (misskey_people_details(data)) +SQL + + $dbh->do(<<'SQL'); +CREATE TABLE misskey_followers ( + taken_at TIMESTAMP WITH TIME ZONE NOT NULL DEFAULT CURRENT_TIMESTAMP PRIMARY KEY, + users integer[] NOT NULL +) +SQL + $dbh->do(<<'SQL'); +CREATE TABLE misskey_following ( + taken_at TIMESTAMP WITH TIME ZONE NOT NULL DEFAULT CURRENT_TIMESTAMP PRIMARY KEY, + users integer[] NOT NULL +) +SQL +} + 1; diff --git a/tweet-archive.pl b/tweet-archive.pl index a1b5fff..1a7232b 100644 --- a/tweet-archive.pl +++ b/tweet-archive.pl @@ -3,9 +3,11 @@ use v5.36; use strict; use warnings; use experimental 'try'; +use lib 'lib','local/lib/perl5'; use Path::Tiny; use JSON::MaybeXS; use Dakkar::TweetArchive; +use Dakkar::NotesArchive; use Dakkar::TweetArchive::Store; my $json_parser = JSON::MaybeXS->new( @@ -24,30 +26,60 @@ my $conf = $json_parser->decode( }, ); -my $client = Dakkar::TweetArchive->new($conf); -my $store = Dakkar::TweetArchive::Store->new($conf); +my $store = Dakkar::TweetArchive::Store->new($conf->{db}); -my $ua = $client->client->ua; -my @responses; -$ua->add_handler( response_done => sub { push @responses, $_[0]; return } ); +if ($conf->{twitter}) { + my $client = Dakkar::TweetArchive->new($conf->{twitter}); -try { - my $latest_id = $store->latest_tweet_id; + my @responses; + $client->client->ua->add_handler( response_done => sub { push @responses, $_[0]; return } ); - for my $tweet ($client->home_timeline($latest_id)->@*) { - $store->store_tweet($tweet); + try { + my $latest_id = $store->latest_tweet_id; + + for my $tweet ($client->home_timeline($latest_id)->@*) { + $store->store_tweet($tweet); + } + + $store->store_twitter_friends($client->friends); + $store->store_twitter_followers($client->followers); } + catch ($e) { + print "Twitter Fail: $e\n"; + for my $res (@responses) { + next if $res->is_success; + print $res->request->as_string; + print $res->as_string; + print "\n",'-' x 50,"\n\n"; + } + } + ; +}; + +if ($conf->{misskey}) { + my $client = Dakkar::NotesArchive->new($conf->{misskey}); - $store->store_friends($client->friends); - $store->store_followers($client->followers); - 1; -} -catch ($e) { - print "Fail: $e\n"; - for my $res (@responses) { - next if $res->is_success; - print $res->request->as_string; - print $res->as_string; - print "\n",'-' x 50,"\n\n"; + my @responses; + $client->client->ua->add_handler( response_done => sub { push @responses, $_[0]; return } ); + + try { + my $latest_id = $store->latest_note_id; + + for my $note ($client->timeline($latest_id)->@*) { + $store->store_note($note); + } + + $store->store_misskey_following($client->following); + $store->store_misskey_followers($client->followers); } + catch ($e) { + print "Misskey Fail: $e\n"; + for my $res (@responses) { + next if $res->is_success; + print $res->request->as_string; + print $res->as_string; + print "\n",'-' x 50,"\n\n"; + } + }; }; + -- cgit v1.2.3