diff --git a/.gitignore b/.gitignore new file mode 100644 index 0000000..ae1b7d1 --- /dev/null +++ b/.gitignore @@ -0,0 +1,6 @@ +# Ignore files in the Python virtual environment +/env/ + +# A convenient place to store downloaded mbox files from production +# to facilitate testing, and automation. +/mboxes/ diff --git a/Makefile b/Makefile new file mode 100644 index 0000000..08cc20d --- /dev/null +++ b/Makefile @@ -0,0 +1,10 @@ +# A handy target to reset the development environment back to a clean slate +# and run the development server. +# XXX: For now just use the single mbox file that was previously downloaded. +# Additional work in this area, for testing use cases, is needed. +dev-rebuild-and-run: + dropdb --if-exists archives + createdb archives + django/manage.py migrate + loader/load_message.py --list pgsql-hackers --mbox mboxes/pgsql-hackers.202504 >/dev/null + cd ./django && ./run_dev.py diff --git a/README.md b/README.md new file mode 100644 index 0000000..f5ae54a --- /dev/null +++ b/README.md @@ -0,0 +1,77 @@ +# PG archives + +This application manages PostgreSQL mailing list archives. However, the search +feature is implemented in pgweb. + +## The Application + +This is a Django 4.2 application backed by PostgreSQL and running on Python 3.x. + +## Getting Started + +### Ubuntu instructions + +First, prepare your development environment by installing python3, postgresql-server-dev-X.Y, formail and libtidy (use `--no-install-recommends` to avoid installing postfix): + +```bash +sudo apt install python3 postgresql-server-dev-14 procmail libtidy5deb1 --no-install-recommends +``` + +Next, configure your local environment with virtualenv and install local dependencies. + +```bash +python3 -m venv env +source env/bin/activate +pip install -r dev_requirements.txt +``` + +Create a database for the application: + +```bash +createdb archives +cd django +./manage.py migrate +# Creates pgsql-hackers list with ID 1 if open. 
+``` + +Create config for the loader scripts: + +```bash +cp loader/archives.ini.sample loader/archives.ini +``` + +Load some emails from the actual PostgreSQL archives by downloading an mbox +file from and running the +following command. NOTE: it's totally fine if some of the emails will fail to +load. + +```bash +loader/load_message.py --list pgsql-hackers --mbox /path/to/downloaded/mbox/file +``` + +Then go to the `django` directory, that's where the actual web application is. + +```bash +cd django +``` + +Create a local settings file (feel free to edit it): + +```bash +cp archives/example_settings_local.py archives/settings_local.py +``` + +Finally, you're ready to start the web application: + +```bash +./run_dev.py +``` + +Or, download the April 2025 mbox file from the PostgreSQL archives and place it in /mboxes. +Then run: +```bash +make dev-rebuild-and-run +``` + +Then open to view your local mailing +list archives. diff --git a/dev_requirements.txt b/dev_requirements.txt new file mode 100644 index 0000000..3b878a3 --- /dev/null +++ b/dev_requirements.txt @@ -0,0 +1,2 @@ +-r requirements.txt +uwsgi diff --git a/django/archives/example_settings_local.py b/django/archives/example_settings_local.py new file mode 100644 index 0000000..6ff29fe --- /dev/null +++ b/django/archives/example_settings_local.py @@ -0,0 +1,21 @@ +# Enable more debugging information +DEBUG = True +# Prevent logging to try to send emails to postgresql.org admins. +# Use the default Django logging settings instead. 
def threads_with_patches(request):
    """Return, as a JSON array, the 10 most recently-updated threads that
    contain at least one patch attachment.

    Each entry pairs the newest patch-bearing message in a thread with the
    thread's first message, so a caller can render a one-line thread summary.
    Only available on public archives.
    """
    if not settings.PUBLIC_ARCHIVES:
        return HttpResponseForbidden('No API access on private archives for now')

    with connection.cursor() as cursor:
        cursor.execute("""-- Find threads with patches
            select *
            from (
                select distinct on (threadid)
                    pm.threadid,
                    pm.id,
                    pm._from,
                    pm.subject,
                    pm.messageid,
                    ma.patch_count,
                    tm.subject AS thread_subject,
                    pm.date AS patch_date,
                    tm.date AS thread_date,
                    tm.messageid AS thread_messageid
                from messages AS pm --patch message
                -- threadid is a shared value but not a foreign key to anything;
                -- in particular, it is not a self-join of messages
                join lateral (
                    select *
                    from messages as im
                    where im.threadid = pm.threadid
                    order by im.date asc
                    limit 1
                ) AS tm on true --thread message is first known message
                join lateral (
                    select count(*) as patch_count
                    from attachments
                    where pm.id = attachments.message and is_patch(attachments)
                ) as ma on true
                where pm.has_attachment and ma.patch_count > 0 and pm.hiddenstatus is null
                order by pm.threadid, pm.date desc
            ) as threads_with_patches
            order by patch_date DESC
            limit 10;
        """)
        rows = cursor.fetchall()

    # Convert the SQL result into thread_list.  file_version / commit_sha /
    # patch_id are placeholders the client-side workflow fills in later.
    thread_list = [
        {
            "thread_id": str(row[0]),
            "message_id": row[1],
            "file_count": row[5],
            "file_version": None,
            "commit_sha": None,
            "patch_id": None,
            "subject_line": row[3],
            "thread_subject": row[6],
            "sender": row[2],
            "id": row[1],
            "patch_date": row[7].strftime('%Y-%m-%d %H:%M:%S') if row[7] else None,
            "thread_date": row[8].strftime('%Y-%m-%d %H:%M:%S') if row[8] else None,
            "message_code": row[4],
            "thread_code": row[9],
        }
        for row in rows
    ]

    resp = HttpResponse(content_type='application/json')
    json.dump(thread_list, resp)
    return resp


def get_patch_data_as_json(threadid, messageid):
    """Build the JSON payload describing a patch message and its thread.

    ``messageid`` is the ``messages.id`` primary key of the patch message;
    ``threadid`` is currently unused (the thread is derived from the message)
    but kept for interface stability.

    Returns a JSON string, or None when no message with that id exists
    (the previous version crashed with a TypeError on a missing row).
    """
    with connection.cursor() as cursor:
        cursor.execute("""-- Describe one patch message and its thread
            select
                pm.threadid,
                pm.id,
                tm.messageid as thread_messageid,
                mrm.mostrecent_messageid,
                pm.messageid as patch_messageid,
                ma.fileset,
                pm._from as patch_from_author,
                tm.date as thread_messagedate,
                mrm.mostrecent_messagedate,
                pm.date as patch_messagedate,
                tm.subject as thread_subject_line,
                mrm.most_recent_subject_line,
                mrm.most_recent_from_author,
                tm._from as thread_from_author
            from messages AS pm --patch message
            join lateral (
                select *
                from messages as im
                where im.threadid = pm.threadid
                order by im.date asc
                limit 1
            ) AS tm on true --thread message is first known message
            join lateral (
                select
                    id as mostrecent_id,
                    messageid as mostrecent_messageid,
                    date as mostrecent_messagedate,
                    subject as most_recent_subject_line,
                    _from as most_recent_from_author
                from messages
                where threadid = pm.threadid
                order by date desc limit 1
            ) as mrm on true
            join lateral (
                select jsonb_agg(
                    jsonb_build_object(
                        'attachment_id', a.id,
                        'filename', a.filename,
                        'content_type', a.contenttype,
                        'is_patch', is_patch(a)
                    ) order by a.filename) as fileset
                from attachments as a
                where pm.id = a.message
            ) as ma on true
            where pm.id = %s;
        """, (messageid,))
        row = cursor.fetchone()

    if row is None:
        # No such message: let the caller answer 404 instead of crashing.
        return None

    # psycopg2 may hand jsonb back already deserialized (a Python list) or as
    # text, depending on adapter registration -- accept both.
    # NOTE(review): confirm which applies in this deployment.
    fileset = row[5]
    if isinstance(fileset, str):
        fileset = json.loads(fileset)
    elif fileset is None:
        fileset = []

    patch_data = {
        "thread_id": row[0],
        "message_id": row[1],
        "thread_message_id": row[2],
        "most_recent_message_id": row[3],
        "patch_message_id": row[4],
        "patch_from_author": row[6],
        "fileset": fileset,
        "thread_message_date": row[7].isoformat() if row[7] else None,
        "most_recent_message_date": row[8].isoformat() if row[8] else None,
        "patch_message_date": row[9].isoformat() if row[9] else None,
        "thread_subject_line": row[10],
        "most_recent_subject_line": row[11],
        "most_recent_from_author": row[12],
        "thread_from_author": row[13],
    }
    return json.dumps(patch_data)


def create_cfapp_patch(request):
    """Proxy endpoint: forward patch metadata for a thread/message pair to the
    commitfest app's create_patch API and relay its response.

    Expects a JSON POST body with "thread_id" and "message_id" keys.
    """
    if not settings.PUBLIC_ARCHIVES:
        return HttpResponseForbidden('No API access on private archives for now')

    if request.method != 'POST':
        return JsonResponse({'error': 'Invalid request method'}, status=405)

    try:
        body_json = json.loads(request.body.decode("utf-8"))
        thread_id = body_json["thread_id"]
        message_id = body_json["message_id"]
    except (ValueError, KeyError) as e:
        # Malformed JSON or missing keys is the client's fault: 400, not 500.
        return JsonResponse({'error': f'Invalid request body: {str(e)}'}, status=400)

    patch_json = get_patch_data_as_json(thread_id, message_id)
    if patch_json is None:
        return JsonResponse({'error': 'Message not found'}, status=404)

    try:
        # Forward the patch metadata to the external service.
        # XXX: target URL is hard-coded for development; should come from settings.
        response = requests.post(
            'http://localhost:8007/api/test/cfapp/create_patch',
            headers={'Content-Type': 'application/json'},
            data=patch_json,
            timeout=10,  # never hang the archives worker on a dead peer
        )

        # Return the response from the external service
        return JsonResponse(response.json(), status=response.status_code)
    except requests.RequestException as e:
        return JsonResponse({'error': f'Failed to proxy request: {str(e)}'}, status=500)
unresolved_messages(msgid, message); + +/* A couple of convenience views that exclude the content fields. */ +CREATE VIEW messages_meta AS + SELECT + id, + parentid, + threadid, + _from, + _to, + cc, + subject, + date, + has_attachment, + hiddenstatus, + messageid + FROM messages; + +CREATE VIEW attachments_meta AS + SELECT + id, + message, + filename, + contenttype + FROM attachments; + +INSERT INTO listgroups (groupid, groupname, sortkey) VALUES (1, 'Developer lists', 1) + ON CONFLICT (groupid) DO NOTHING; + +INSERT INTO lists (listid, listname, shortdesc, description, active, groupid, subscriber_access) + VALUES (1, 'pgsql-hackers', 'pgsql-hackers', -- implicit concatentation below + 'The PostgreSQL developers team lives here. ' + 'Discussion of current development issues, problems and bugs, and proposed new features. ' + 'If your question cannot be answered by people in the other lists, ' + 'and it is likely that only a developer will know the answer, you may re-post your question in this list. 
' + 'You must try elsewhere first!', True, 1, True) + ON CONFLICT (listid) DO NOTHING; + +CREATE TEXT SEARCH CONFIGURATION pg (COPY=pg_catalog.english); + +/* +CREATE TEXT SEARCH DICTIONARY english_ispell ( + TEMPLATE = ispell, + DictFile = english, + AffFile = english, + StopWords = english +); +CREATE TEXT SEARCH DICTIONARY pg_dict ( + TEMPLATE = synonym, + SYNONYMS = pg_dict +); +CREATE TEXT SEARCH DICTIONARY pg_stop ( + TEMPLATE = simple, + StopWords = pg_dict +); +*/ +ALTER TEXT SEARCH CONFIGURATION pg + ALTER MAPPING FOR asciiword, asciihword, hword_asciipart, + word, hword, hword_part + WITH english_stem; + +ALTER TEXT SEARCH CONFIGURATION pg + DROP MAPPING FOR email, url, url_path, sfloat, float; + +CREATE FUNCTION messages_fti_trigger_func() RETURNS trigger AS $$ +BEGIN + NEW.fti = setweight(to_tsvector('public.pg', coalesce(new.subject, '')), 'A') || + setweight(to_tsvector('public.pg', coalesce(new.bodytxt, '')), 'D'); + RETURN NEW; +END +$$ LANGUAGE 'plpgsql'; + +CREATE TRIGGER messages_fti_trigger + BEFORE INSERT OR UPDATE OF subject, bodytxt ON messages + FOR EACH ROW EXECUTE PROCEDURE messages_fti_trigger_func(); + +CREATE INDEX messages_fti_idx ON messages USING gin(fti); + + """, + ), + + ] diff --git a/django/archives/mailarchives/migrations/0006_alter_message_parentid.py b/django/archives/mailarchives/migrations/0006_alter_message_parentid.py new file mode 100644 index 0000000..16d56bc --- /dev/null +++ b/django/archives/mailarchives/migrations/0006_alter_message_parentid.py @@ -0,0 +1,18 @@ +# Generated by Django 4.2.20 on 2025-04-28 18:37 + +from django.db import migrations, models + + +class Migration(migrations.Migration): + + dependencies = [ + ('mailarchives', '0005_sync_with_loader'), + ] + + operations = [ + migrations.AlterField( + model_name='message', + name='parentid', + field=models.IntegerField(blank=True, null=True), + ), + ] diff --git a/django/archives/mailarchives/migrations/0007_add_is_patch_function.py 
b/django/archives/mailarchives/migrations/0007_add_is_patch_function.py new file mode 100644 index 0000000..1420926 --- /dev/null +++ b/django/archives/mailarchives/migrations/0007_add_is_patch_function.py @@ -0,0 +1,24 @@ +# -*- coding: utf-8 -*- +# Generated by Django 1.11.18 on 2019-06-19 19:02 +from __future__ import unicode_literals + +from django.db import migrations + +class Migration(migrations.Migration): + + dependencies = [ + ('mailarchives', '0006_alter_message_parentid'), + ] + + operations = [ + migrations.RunSQL( + """ +CREATE FUNCTION is_patch(att attachments) RETURNS boolean LANGUAGE sql IMMUTABLE STRICT + RETURN (att).filename ~ '\.(diff|diff\.gz|patch|patch\.gz|tar\.gz|tgz|tar\.bz2|zip)$'; +; + """, + reverse_sql=""" +DROP FUNCTION is_patch(att attachments); + """, + ), + ] diff --git a/django/archives/mailarchives/models.py b/django/archives/mailarchives/models.py index 44c4469..c0ff6ea 100644 --- a/django/archives/mailarchives/models.py +++ b/django/archives/mailarchives/models.py @@ -25,7 +25,7 @@ class Message(models.Model): messageid = models.TextField(null=False) bodytxt = models.TextField(null=False) # rawtxt is a bytea field, which django doesn't support (easily) - parentid = models.IntegerField(null=False, blank=False) + parentid = models.IntegerField(null=True, blank=True) has_attachment = models.BooleanField(null=False, default=False) hiddenstatus = models.IntegerField(null=True) # fti is a tsvector field, which django doesn't support (easily) @@ -74,6 +74,9 @@ class ListGroup(models.Model): groupname = models.CharField(max_length=200, null=False, blank=False) sortkey = models.IntegerField(null=False) + def __str__(self): + return self.groupname + class Meta: db_table = 'listgroups' @@ -93,6 +96,9 @@ def maybe_shortdesc(self): return self.shortdesc return self.listname + def __str__(self): + return self.listname + class Meta: db_table = 'lists' diff --git a/django/archives/mailarchives/templates/advancedsearch.html 
b/django/archives/mailarchives/templates/advancedsearch.html new file mode 100644 index 0000000..7f9d9b1 --- /dev/null +++ b/django/archives/mailarchives/templates/advancedsearch.html @@ -0,0 +1,81 @@ +{%extends "page.html"%} +{%block title%}PostgreSQL Mailing Lists Search{%endblock%} + +{%block contents%} +PostgreSQL Mailing Lists Search + + + + + Search term: + + + + + + + + + + Examples 2025-04 (early) + CANqtF-pgb87qQr94rMeWKsAa2JGBw9Ygo_wH2bzvVZpi4Mnaig@mail.gmail.com + + + List: + + -- All lists + {%for l in lists %} + {%ifchanged l.group%} + -- {{l.group}} + {%endifchanged%} + {{l}} + {%endfor%} + + + + Date: + {%for d in dates%} + {{d.text}}{%endfor%} + + + + Sort By: + {%for s in sortoptions%} + {{s.text}}{%endfor%} + + + + + + Limit to first message per thread + + + + + + +{%if search_error %} +{{search_error}} +{%elif query == '' %} +Click the magnifying glass on the query box to search. +{%else%} + + {%if hitcount == 0 %} + Your search for {{query}} returned no hits. + {%else%} + Results {{firsthit}}-{{lasthit}} of {%if hitcount == 1000%}more than 1000{%else%}{{hitcount}}{%endif%}. + {%if pagelinks %}Result pages: {{pagelinks|safe}}{%endif%} + {%for hit in hits %} + + {{forloop.counter0|add:firsthit}}. {{hit.subject}} [{{hit.rank|floatformat:2}}] + From {{hit.author}} on {{hit.date}}. 
+ Thread# {{hit.threadid}} Rank: {{hit.thread_rank}} + {{hit.abstract_found}} {{hit.abstract_after}} + {{PGWEB_ADDRESS}}/message-id/{{hit.messageid}} + + {%endfor%} + {%if pagelinks %}Result pages: {{pagelinks|safe}}{%endif%} + {%endif%} +{%endif%} + +{%endblock%} diff --git a/django/archives/mailarchives/templates/threads.html b/django/archives/mailarchives/templates/threads.html new file mode 100644 index 0000000..9def39c --- /dev/null +++ b/django/archives/mailarchives/templates/threads.html @@ -0,0 +1,117 @@ +{%extends "page.html"%} +{%block title%}PostgreSQL Mailing List Archives{%endblock%} +{%load pgfilters%} +{%block contents%} +Thread Viewer + +Load Threads with Patches + + + + + Subject Line + Sender + Thread Subject + File Count + File Version + Commit SHA + Patch ID + Patch Date + Thread Date + ID + Action + + + + + + + + + +{%endblock%} diff --git a/django/archives/mailarchives/views.py b/django/archives/mailarchives/views.py index 69172bd..647f561 100644 --- a/django/archives/mailarchives/views.py +++ b/django/archives/mailarchives/views.py @@ -706,67 +706,32 @@ def resend_complete(request, messageid): }) -@csrf_exempt -def search(request): - if not settings.PUBLIC_ARCHIVES: - # We don't support searching of non-public archives at all at this point. - # XXX: room for future improvement - return HttpResponseForbidden('Not public archives') - # Only certain hosts are allowed to call the search API - allowed = False - for ip_range in settings.SEARCH_CLIENTS: - if ipaddress.ip_address(request.META['REMOTE_ADDR']) in ipaddress.ip_network(ip_range): - allowed = True - break - if not allowed: - return HttpResponseForbidden('Invalid host') +def perform_search(query, datecode, sortcode, oneperthread=False, listid=None, listnames=None, streamer=None): + if not query and not streamer: + return [] - curs = connection.cursor() + if not query and streamer: + return False - # Perform a search of the archives and return a JSON document. 
- # Expects the following (optional) POST parameters: - # q = query to search for - # ln = comma separate list of listnames to search in - # d = number of days back to search for, or -1 (or not specified) - # to search the full archives - # s = sort results by ['r'=rank, 'd'=date, 'i'=inverse date] - if not request.method == 'POST': - raise Http404('I only respond to POST') + if listid and listnames: + raise Exception("Cannot specify both listid and listname") - if 'q' not in request.POST: - raise Http404('No search query specified') - query = request.POST['q'] + curs = connection.cursor() - if 'ln' in request.POST: + lists = None + if listnames: try: curs.execute("SELECT listid FROM lists WHERE listname=ANY(%(names)s)", { - 'names': request.POST['ln'].split(','), + 'names': listnames.split(','), }) lists = [x for x, in curs.fetchall()] except Exception: # If failing to parse list of lists, just search all lists = None - else: - lists = None - if 'd' in request.POST: - days = int(request.POST['d']) - if days < 1 or days > 365: - firstdate = None - else: - firstdate = datetime.now() - timedelta(days=days) - else: - firstdate = None - - if 's' in request.POST: - list_sort = request.POST['s'] - if list_sort not in ('d', 'r', 'i'): - list_stort = 'r' - else: - list_sort = 'r' - - # Ok, we have all we need to do the search + if listid: + lists = [listid] if query.find('@') > 0: cleaned_id = query.strip().removeprefix('<').removesuffix('>') @@ -778,15 +743,59 @@ def search(request): }) a = curs.fetchall() if len(a) == 1: - # Yup, this was a messageid - resp = HttpResponse(content_type='application/json') - - json.dump({'messageidmatch': 1}, resp) - return resp + if streamer: + json.dump({'messageidmatch': 1}, streamer) + else: + return [{'messageidmatch': cleaned_id}] # If not found, fall through to a regular search + firstdate = None + if datecode: + days = int(datecode) + if days >= 1 and days <= 365: + firstdate = datetime.now() - timedelta(days=days) + + list_sort = 
'i' + if sortcode: + if sortcode in ('d', 'r', 'i'): + list_sort = sortcode + curs.execute("SET gin_fuzzy_search_limit=10000") - qstr = "SELECT messageid, date, subject, _from, ts_rank_cd(fti, plainto_tsquery('public.pg', %(q)s)), ts_headline(bodytxt, plainto_tsquery('public.pg', %(q)s),'StartSel=\"[[[[[[\",StopSel=\"]]]]]]\"') FROM messages m WHERE fti @@ plainto_tsquery('public.pg', %(q)s)" + + qstr = """-- Search for messages matching query -- +SELECT * FROM ( +SELECT + *, +""" + qstr += " row_number() over (partition by threadid order by " + if list_sort == 'r': + qstr += "ts_rank_cd DESC" + elif list_sort == 'd': + qstr += "date DESC" + else: + qstr += "date ASC" + qstr += ") AS thread_rank" + + qstr +=""" +FROM +( + SELECT + messageid, + threadid, + date, + subject, + _from, + ts_rank_cd(fti, plainto_tsquery('public.pg', %(q)s)), + ts_headline( + bodytxt, + plainto_tsquery('public.pg', %(q)s), + 'StartSel=\"[[[[[[\", + StopSel=\"]]]]]]\"' + ) + FROM messages m + WHERE fti @@ plainto_tsquery('public.pg', %(q)s) +""" + params = { 'q': query, } @@ -796,18 +805,22 @@ def search(request): if firstdate: qstr += " AND m.date > %(date)s" params['date'] = firstdate + + qstr += ") AS finding ) AS ranking" + + if oneperthread: + qstr += " WHERE thread_rank = 1" + if list_sort == 'r': - qstr += " ORDER BY ts_rank_cd(fti, plainto_tsquery(%(q)s)) DESC LIMIT 1000" + qstr += " ORDER BY ts_rank_cd DESC LIMIT 1000" elif list_sort == 'd': qstr += " ORDER BY date DESC LIMIT 1000" else: qstr += " ORDER BY date ASC LIMIT 1000" curs.execute(qstr, params) - - resp = HttpResponse(content_type='application/json') - - json.dump([ + if streamer: + json.dump([ { 'm': messageid, 'd': date.isoformat(), @@ -815,10 +828,141 @@ def search(request): 'f': mailfrom, 'r': rank, 'a': abstract.replace("[[[[[[", "").replace("]]]]]]", ""), - } for messageid, date, subject, mailfrom, rank, abstract in curs.fetchall()], - resp) + } for messageid, threadid, date, subject, mailfrom, rank, abstract, 
def advanced_search(request):
    """Render the interactive archives search page (advancedsearch.html).

    GET parameters:
      q -- query string (a message-id-looking query redirects to the message)
      s -- sort: 'r' rank, 'd' date, 'i' reverse date (default 'd')
      d -- day window to search, -1 for all time
      r -- '1' to limit results to the first hit per thread

    TODO: wire up real pagination, e.g.
      'pagelinks': " ".join(generate_pagelinks(
          pagenum, (totalhits - 1) // hitsperpage + 1, querystr))
    """
    queryval = request.GET.get('q', None)
    sortval = request.GET.get('s', 'd')
    dateval = request.GET.get('d', '-1')
    oneperthread = request.GET.get('r', '0')

    # XXX: hard-coded to list 1 (pgsql-hackers) for now.
    listid = 1

    hits = perform_search(queryval, dateval, sortval, oneperthread == '1', listid=listid)
    totalhits = len(hits)

    # A single hit flagged as a messageid match jumps straight to the message.
    if totalhits == 1 and 'messageidmatch' in hits[0]:
        return HttpResponseRedirect('/message-id/%s' % hits[0]['messageidmatch'])

    firsthit = 1
    hitsperpage = 20

    sortoptions = (
        {'val': 'r', 'text': 'Rank', 'selected': request.GET.get('s', '') not in ('d', 'i')},
        {'val': 'd', 'text': 'Date', 'selected': request.GET.get('s', '') == 'd'},
        {'val': 'i', 'text': 'Reverse date', 'selected': request.GET.get('s', '') == 'i'},
    )

    dateoptions = (
        {'val': -1, 'text': 'anytime'},
        {'val': 1, 'text': 'within last day'},
        {'val': 7, 'text': 'within last week'},
        {'val': 31, 'text': 'within last month'},
        {'val': 186, 'text': 'within last 6 months'},
        {'val': 365, 'text': 'within last year'},
    )

    (groups, listgroupid) = get_all_groups_and_lists(request)
    return render_nav(NavContext(request, all_groups=groups), 'advancedsearch.html', {
        'groups': [{'groupname': g['groupname'], 'lists': g['lists']} for g in groups],
        'hitcount': totalhits,
        'firsthit': firsthit,
        'lasthit': min(totalhits, firsthit + hitsperpage - 1),
        'query': request.GET['q'] if 'q' in request.GET else '',
        'archives_root': '/',  # settings.ARCHIVES_FRONT_ADDRESS
        'pagelinks': '',
        'hits': [{
            'date': h['d'],
            'subject': h['s'],
            'author': h['f'],
            'messageid': h['m'],
            'threadid': h['t'],
            'thread_rank': h['tr'],
            'abstract': h['a'],
            'abstract_found': h['a_found'],
            'abstract_after': h['a_after'],
            'rank': h['r'],
        } for h in hits[firsthit - 1:firsthit + hitsperpage - 1]],
        'sortoptions': sortoptions,
        'lists': List.objects.all().order_by("group__sortkey"),
        'listid': listid,
        'dates': dateoptions,
        'dateval': dateval,
        'oneperthread': oneperthread,
    })


@csrf_exempt
def search(request):
    """JSON search API used by pgweb (POST only, restricted by SEARCH_CLIENTS)."""
    if not settings.PUBLIC_ARCHIVES:
        # We don't support searching of non-public archives at all at this point.
        # XXX: room for future improvement
        return HttpResponseForbidden('Not public archives')

    # Only certain hosts are allowed to call the search API
    allowed = False
    for ip_range in settings.SEARCH_CLIENTS:
        if ipaddress.ip_address(request.META['REMOTE_ADDR']) in ipaddress.ip_network(ip_range):
            allowed = True
            break
    if not allowed:
        return HttpResponseForbidden('Invalid host')

    # Perform a search of the archives and return a JSON document.
    # Expects the following (optional) POST parameters:
    #  q = query to search for
    #  ln = comma separate list of listnames to search in
    #  d = number of days back to search for, or -1 (or not specified)
    #      to search the full archives
    #  s = sort results by ['r'=rank, 'd'=date, 'i'=inverse date]
    if not request.method == 'POST':
        raise Http404('I only respond to POST')

    if 'q' not in request.POST:
        raise Http404('No search query specified')
    query = request.POST['q']
    ln = request.POST['ln'] if 'ln' in request.POST else None

    dateval = request.POST.get('d', '-1')
    sortval = request.POST.get('s', 'i')

    resp = HttpResponse(content_type='application/json')
    # BUG FIX: perform_search's keyword is `listnames`, not `listname` --
    # the misspelled keyword raised TypeError on every call that reached it.
    perform_search(query, dateval, sortval, listnames=ln, streamer=resp)
    return resp


def threads(request):
    """Render the thread-viewer page (threads.html)."""
    return render(
        request,
        'threads.html',
        {
            'request': request,
        })
#!/usr/bin/env python3
"""Run the development web server under uwsgi.

Locates the installed Django package so uwsgi can serve the admin static
files directly, then execs uwsgi with an ini file (the first command-line
argument, defaulting to uwsgi_dev.ini).
"""
from importlib.machinery import PathFinder
import subprocess
import sys

# Filesystem location of the installed django package.
django_path = PathFinder().find_spec("django").submodule_search_locations[0]

django_admin_path = django_path + "/contrib/admin/static/admin"

if len(sys.argv) > 1:
    ini_file = sys.argv[1]
else:
    ini_file = "uwsgi_dev.ini"

subprocess.run(
    [
        "uwsgi",
        "--static-map",
        # FIX: use the precomputed variable (it was assigned but never used,
        # and the identical path was rebuilt inline here).
        f"/static/admin={django_admin_path}",
        ini_file,
    ]
)
--git a/loader/sql/schema.sql b/loader/sql/schema.sql deleted file mode 100644 index be735d9..0000000 --- a/loader/sql/schema.sql +++ /dev/null @@ -1,160 +0,0 @@ -\set ON_ERROR_STOP on - -BEGIN; - -CREATE TABLE messages ( - id SERIAL NOT NULL PRIMARY KEY, - parentid int REFERENCES messages, - threadid int NOT NULL, - _from text NOT NULL, - _to text NOT NULL, - cc text NOT NULL, - subject text NOT NULL, - date timestamptz NOT NULL, - loaddate timestamptz NOT NULL DEFAULT CURRENT_TIMESTAMP, - has_attachment boolean NOT NULL, - hiddenstatus int NULL, - messageid text NOT NULL, - bodytxt text NOT NULL, - rawtxt bytea NOT NULL, - fti tsvector NOT NULL -); -CREATE INDEX idx_messages_threadid ON messages(threadid); -CREATE UNIQUE INDEX idx_messages_msgid ON messages(messageid); -CREATE INDEX idx_messages_date ON messages(date); -CREATE INDEX idx_messages_parentid ON messages(parentid); - -CREATE TABLE message_hide_reasons ( - message int NOT NULL PRIMARY KEY REFERENCES messages, - dt timestamptz, - reason text, - by text -); - -CREATE SEQUENCE threadid_seq; - -CREATE TABLE unresolved_messages( - message int NOT NULL REFERENCES messages, - priority int NOT NULL, - msgid text NOT NULL, - CONSTRAINT unresolved_messages_pkey PRIMARY KEY (message, priority) -); - -CREATE UNIQUE INDEX idx_unresolved_msgid_message ON unresolved_messages(msgid, message); - -CREATE TABLE listgroups( - groupid int NOT NULL PRIMARY KEY, - groupname text NOT NULL UNIQUE, - sortkey int NOT NULL -); - -CREATE TABLE lists( - listid int NOT NULL PRIMARY KEY, - listname text NOT NULL UNIQUE, - shortdesc text NOT NULL, - description text NOT NULL, - active boolean NOT NULL, - groupid int NOT NULL REFERENCES listgroups(groupid) -); - -CREATE TABLE list_months( - listid int NOT NULL REFERENCES lists(listid), - year int NOT NULL, - month int NOT NULL, - CONSTRAINT list_months_pk PRIMARY KEY (listid, year, month) -); - -CREATE TABLE list_threads( - threadid int NOT NULL, /* comes from threadid_seq */ - listid 
int NOT NULL REFERENCES lists(listid), - CONSTRAINT pg_list_threads PRIMARY KEY (threadid, listid) -); -CREATE INDEX list_threads_listid_idx ON list_threads(listid); - -CREATE TABLE attachments( - id serial not null primary key, - message int not null references messages(id), - filename text not null, - contenttype text not null, - attachment bytea not null -); -CREATE INDEX idx_attachments_msg ON attachments(message); - -CREATE TABLE loaderrors( - id SERIAL NOT NULL PRIMARY KEY, - listid int NOT NULL, - dat timestamptz NOT NULL DEFAULT CURRENT_TIMESTAMP, - msgid text NOT NULL, - srctype text NOT NULL, - src text NOT NULL, - err text NOT NULL -); - -/* textsearch configs */ -CREATE TEXT SEARCH CONFIGURATION pg (PARSER=tsparser); - -CREATE TEXT SEARCH DICTIONARY english_ispell ( - TEMPLATE = ispell, - DictFile = en_us, - AffFile = en_us, - StopWords = english -); -CREATE TEXT SEARCH DICTIONARY pg_dict ( - TEMPLATE = synonym, - SYNONYMS = pg_dict -); -CREATE TEXT SEARCH DICTIONARY pg_stop ( - TEMPLATE = simple, - StopWords = pg_dict -); -ALTER TEXT SEARCH CONFIGURATION pg - ALTER MAPPING FOR asciiword, asciihword, hword_asciipart, - word, hword, hword_part - WITH pg_stop, pg_dict, english_ispell, english_stem; -ALTER TEXT SEARCH CONFIGURATION pg - DROP MAPPING FOR email, url, url_path, sfloat, float; - -CREATE FUNCTION messages_fti_trigger_func() RETURNS trigger AS $$ -BEGIN - NEW.fti = setweight(to_tsvector('public.pg', coalesce(new.subject, '')), 'A') || - setweight(to_tsvector('public.pg', coalesce(new.bodytxt, '')), 'D'); - RETURN NEW; -END -$$ LANGUAGE 'plpgsql'; - -CREATE TRIGGER messages_fti_trigger - BEFORE INSERT OR UPDATE OF subject, bodytxt ON messages - FOR EACH ROW EXECUTE PROCEDURE messages_fti_trigger_func(); -CREATE INDEX messages_fti_idx ON messages USING gin(fti); - -CREATE TABLE legacymap( - listid int not null, - year int not null, - month int not null, - msgnum int not null, - msgid text not null, -CONSTRAINT legacymap_pk PRIMARY KEY (listid, 
year, month, msgnum) -); - -/* Simple API for hiding messages */ -CREATE OR REPLACE FUNCTION hide_message(msgid_txt text, reason_code integer, user_txt text, reason_txt text) - RETURNS integer AS -$BODY$ -DECLARE - returned_id integer; -BEGIN - UPDATE messages SET hiddenstatus = reason_code WHERE messageid = msgid_txt RETURNING id INTO returned_id; - - IF NOT FOUND THEN - RAISE EXCEPTION 'The specified message (%) could not be found.', msgid_txt; - END IF; - - INSERT INTO message_hide_reasons (message, dt, reason, by) VALUES (returned_id, now(), reason_txt, user_txt); - - RETURN returned_id; -END; -$BODY$ - LANGUAGE plpgsql VOLATILE - COST 100; - -\echo Dont forget to commit! diff --git a/requirements.txt b/requirements.txt new file mode 100644 index 0000000..0064172 --- /dev/null +++ b/requirements.txt @@ -0,0 +1,7 @@ +django>=4.2,<4.3 +psycopg2 +requests +pycryptodome +pycryptodomex +python-dateutil +pytidylib
Your search for {{query}} returned no hits.