From 4a79a0fcb2e86eb92392d8d46029f00e0438b1ec Mon Sep 17 00:00:00 2001 From: dakkar Date: Fri, 29 Jun 2018 15:41:48 +0100 Subject: index the tweets --- lib/Dakkar/TweetArchive/Store.pm | 88 ++++++++++++++++++++++++++++++++++++++++ 1 file changed, 88 insertions(+) diff --git a/lib/Dakkar/TweetArchive/Store.pm b/lib/Dakkar/TweetArchive/Store.pm index cc64a87..8470823 100644 --- a/lib/Dakkar/TweetArchive/Store.pm +++ b/lib/Dakkar/TweetArchive/Store.pm @@ -252,4 +252,92 @@ SQL $dbh->do(q{ALTER TABLE new_followers RENAME TO followers}); } +sub _schema_deploy_4($self) { + # let's get some sensible ordering when saying "order by id" + $dbh->do(<<'SQL'); +ALTER TABLE tweets ALTER COLUMN id SET DATA TYPE bigint USING id::bigint +SQL + + # get the text of a tweet + # + # we don't actually get extended_tweet, probably Twitter only + # gives it to "recognised" clients + $dbh->do(<<'SQL'); +CREATE OR REPLACE FUNCTION tweet_text(IN t JSONB) RETURNS text +LANGUAGE sql +IMMUTABLE +RETURNS NULL ON NULL INPUT +PARALLEL SAFE +AS $$ SELECT +CASE t ? 'extended_tweet' + WHEN true THEN t->'extended_tweet'->>'text' + ELSE t->>'text' +END +$$; +SQL + + # get text, including RTs and quotes + $dbh->do(<<'SQL'); +CREATE OR REPLACE FUNCTION tweet_text_recursive(IN t JSONB) RETURNS text +LANGUAGE sql +IMMUTABLE +RETURNS NULL ON NULL INPUT +PARALLEL SAFE +AS $$ SELECT +tweet_text(t) || ' ' || +COALESCE( tweet_text_recursive(t->'retweeted_status'), '') || ' ' || +COALESCE( tweet_text_recursive(t->'quoted_status'), '') +$$; +SQL + + # map a tweet to a full-text search language configuration + $dbh->do(<<'SQL'); +CREATE OR REPLACE FUNCTION tweet_language(IN t JSONB) RETURNS regconfig +LANGUAGE sql +IMMUTABLE +RETURNS NULL ON NULL INPUT +PARALLEL SAFE +AS $$ SELECT CASE t->>'lang' +WHEN 'it' THEN 'pg_catalog.italian'::regconfig +WHEN 'fr' THEN 'pg_catalog.french'::regconfig +WHEN 'de' THEN 'pg_catalog.german'::regconfig +WHEN 'es' THEN 'pg_catalog.spanish'::regconfig +WHEN 'nl' THEN 'pg_catalog.dutch'::regconfig +ELSE 'pg_catalog.english'::regconfig +END +$$; +SQL + + # add the full-text search data + $dbh->do(<<'SQL'); +ALTER TABLE tweets ADD COLUMN fts tsvector; +SQL + $dbh->do(<<'SQL'); +UPDATE tweets SET fts = to_tsvector(tweet_language(data),tweet_text_recursive(data)); +SQL + $dbh->do(<<'SQL'); +CREATE INDEX tweets_fts ON tweets USING GIN (fts); +SQL + + # and make sure it stays up-to-date + $dbh->do(<<'SQL'); +CREATE OR REPLACE FUNCTION tweets_fts_trigger() +RETURNS trigger +LANGUAGE plpgsql +AS $$ +begin + new.fts := to_tsvector(tweet_language(new.data),tweet_text_recursive(new.data)); + return new; +end +$$; +SQL + $dbh->do(<<'SQL'); +CREATE TRIGGER tweets_fts_update + BEFORE INSERT OR UPDATE + ON tweets + FOR EACH ROW + EXECUTE PROCEDURE tweets_fts_trigger(); +SQL +} + 1; -- cgit v1.2.3