Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
24 commits
Select commit Hold shift + click to select a range
956a614
altera concurrency para 1
pitangainnovare Aug 10, 2025
99788fa
altera versao para 1.13.0
pitangainnovare Aug 10, 2025
278597b
remove campo db_id de metadados exportados de article e de journal
pitangainnovare Aug 10, 2025
b35e66c
Cria choice para indicar que log gerou erro
pitangainnovare Aug 10, 2025
6531d87
corrige obtencao de campo pdfs
pitangainnovare Aug 10, 2025
5058488
melhora valoracao de campo de data
pitangainnovare Aug 10, 2025
1d06fb1
corrige wagtail para publication_year (e nao publication date)
pitangainnovare Aug 10, 2025
b1d2fea
seta index_name padrao para usage
pitangainnovare Aug 10, 2025
c4306f9
cria metodo para padronizar ano de publicacao
pitangainnovare Aug 10, 2025
f5761e2
cria field para contar quantos arquivos foram exportados
pitangainnovare Aug 10, 2025
6e34abd
cria campo para guardar resumo do processamento do arquivo
pitangainnovare Aug 10, 2025
1cec4cf
cria campo para guardar ultima linha processada
pitangainnovare Aug 10, 2025
ab025ae
remove solr de production yaml
pitangainnovare Aug 10, 2025
617fac7
flexibiliza tracker para nao criar erro na hora de instanciacao
pitangainnovare Aug 10, 2025
f1043a2
atualiza lib counter para 1.5.1
pitangainnovare Aug 10, 2025
14446cb
cria line command para povoar dados de artigos
pitangainnovare Aug 10, 2025
213d156
refatora utils para parse_utils, index_utils e file_utils
pitangainnovare Aug 10, 2025
0cb7351
remove todos os models metrics
pitangainnovare Aug 10, 2025
c44fd54
transforma test_utils em test_index_utils
pitangainnovare Aug 10, 2025
297b368
faz algumas melhorias de terminologia e propagacao de erros em log_ma…
pitangainnovare Aug 10, 2025
92f3ede
reconstroi modulo es para lidar com utilitários
pitangainnovare Aug 10, 2025
bd2622c
refatora tasks para usar melhor es e gravar pouco em DB
pitangainnovare Aug 10, 2025
9ea1ab4
atualiza wagtailhooks de log_manager com novos campos
pitangainnovare Aug 10, 2025
5fd7686
adiciona arquivos de migracoes
pitangainnovare Aug 10, 2025
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion VERSION
Original file line number Diff line number Diff line change
@@ -1 +1 @@
1.12.1
1.13.0
File renamed without changes.
Empty file.
80 changes: 80 additions & 0 deletions article/management/commands/load_articles_by_year.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,80 @@
from django.core.management.base import BaseCommand

from article.tasks import task_load_article_from_opac, task_load_article_from_article_meta


class Command(BaseCommand):
    """Queue one article-loading task per calendar year.

    For every year in the inclusive range ``--start-year``..``--end-year`` the
    command enqueues the selected task with ``from_date`` set to January 1st
    and ``until_date`` set to December 31st of that year, then prints the id
    of each queued task.
    """

    help = (
        'Generate task requests for loading article data from OPAC or Article Meta, '
        'one per year between --start-year and --end-year (default: 1990 to 2025)'
    )

    def add_arguments(self, parser):
        """Register the year range, collection and task-selection options."""
        parser.add_argument(
            '--start-year',
            type=int,
            default=1990,
            help='Start year (default: 1990)'
        )
        parser.add_argument(
            '--end-year',
            type=int,
            default=2025,
            help='End year (default: 2025)'
        )
        parser.add_argument(
            '--collection',
            type=str,
            default='scl',
            help='Collection code (default: scl)'
        )
        parser.add_argument(
            '--task',
            choices=['load_article_from_opac', 'load_article_from_article_meta'],
            default='load_article_from_opac',
            help='Task to execute (default: load_article_from_opac)',
        )

    def handle(self, *args, **options):
        """Enqueue the chosen task once per year and report each task id.

        Raises:
            CommandError: if ``--start-year`` is greater than ``--end-year``.
        """
        # Imported locally so the module-level import block stays untouched.
        from django.core.management.base import CommandError

        start_year = options['start_year']
        end_year = options['end_year']
        collection = options['collection']

        # A reversed range would otherwise queue nothing and still print a
        # "Completed!" banner, hiding the operator's mistake.
        if start_year > end_year:
            raise CommandError(
                f'--start-year ({start_year}) must not be greater than --end-year ({end_year})'
            )

        # The task choice does not depend on the year, so resolve it once.
        # Any value other than the Article Meta name falls back to OPAC.
        if options['task'] == 'load_article_from_article_meta':
            task = task_load_article_from_article_meta
        else:
            task = task_load_article_from_opac

        self.stdout.write(
            self.style.SUCCESS(
                f'Generating task requests from {start_year} to {end_year} for collection: {collection}'
            )
        )

        total_tasks = 0

        for year in range(start_year, end_year + 1):
            from_date = f'{year}-01-01'
            until_date = f'{year}-12-31'

            self.stdout.write(f'Queuing task for year {year}...')

            task_result = task.delay(
                from_date=from_date,
                until_date=until_date,
                collection=collection
            )
            total_tasks += 1

            self.stdout.write(
                self.style.SUCCESS(
                    f'✓ Task queued for year {year}: {from_date} to {until_date} (Task ID: {task_result.id})'
                )
            )

        self.stdout.write(
            self.style.SUCCESS(
                f'\nCompleted! {total_tasks} tasks have been queued successfully.'
            )
        )
1 change: 0 additions & 1 deletion article/models.py
Original file line number Diff line number Diff line change
Expand Up @@ -114,7 +114,6 @@ def metadata(cls, collection=None):

for a in qs.iterator():
yield {
'id': a.id,
'collection': a.collection.acron3,
'default_lang': a.default_lang,
'files': a.files,
Expand Down
6 changes: 3 additions & 3 deletions article/tasks.py
Original file line number Diff line number Diff line change
Expand Up @@ -60,7 +60,7 @@ def task_load_article_from_article_meta(self, from_date=None, until_date=None, d
try:
article, created = models.Article.objects.get_or_create(collection=col_obj, scielo_issn=jou.scielo_issn, pid_v2=obj.get('code'))
if created or force_update:
article.files = obj.get('files') or {}
article.files = obj.get('pdfs') or {}
article.processing_date = obj.get('processing_date') or ''
article.publication_date = obj.get('publication_date') or ''
article.publication_year = obj.get('publication_year') or ''
Expand Down Expand Up @@ -121,8 +121,8 @@ def task_load_article_from_opac(self, collection='scl', from_date=None, until_da
article.pid_v3 = doc.get('pid_v3') or ''
if not created:
article.pid_v2 = doc.get('pid_v2') or ''
article.publication_date = doc.get('publication_date') or ''
article.default_lang = doc.get('default_language') or ''
article.publication_date = doc.get('publication_date') or article.publication_date or ''
article.default_lang = doc.get('default_language') or article.default_lang or ''

try:
article.publication_year = article.publication_date[:4]
Expand Down
2 changes: 1 addition & 1 deletion article/wagtail_hooks.py
Original file line number Diff line number Diff line change
Expand Up @@ -22,7 +22,7 @@ class ArticleSnippetViewSet(SnippetViewSet):
"pid_v3",
"pid_generic",
"files",
"publication_date",
"publication_year",
)
list_filter = (
"collection",
Expand Down
2 changes: 1 addition & 1 deletion compose/local/django/celery/worker/start
Original file line number Diff line number Diff line change
Expand Up @@ -4,4 +4,4 @@ set -o errexit
set -o nounset


watchgod celery.__main__.main --args -A config.celery_app worker -l INFO --concurrency=4
watchgod celery.__main__.main --args -A config.celery_app worker -l INFO --concurrency=1
2 changes: 1 addition & 1 deletion compose/production/django/celery/worker/start
Original file line number Diff line number Diff line change
Expand Up @@ -5,4 +5,4 @@ set -o pipefail
set -o nounset


exec celery -A config.celery_app worker -l INFO --concurrency=4
exec celery -A config.celery_app worker -l INFO --concurrency=1
2 changes: 1 addition & 1 deletion config/settings/base.py
Original file line number Diff line number Diff line change
Expand Up @@ -407,7 +407,7 @@
# Elasticsearch
# ------------------------------------------------------------------------------
ES_URL = env("ES_URL", default="http://192.168.0.33:9200/")
ES_INDEX_NAME = env("ES_INDEX_NAME", default="usage-daily")
ES_INDEX_NAME = env("ES_INDEX_NAME", default="usage")
ES_API_KEY = env("ES_API_KEY", default="")
# NOTE(review): the fallback below embeds a literal username/password pair in
# source control; consider requiring ES_BASIC_AUTH from the environment instead.
# NOTE(review): presumably a value read from the environment arrives as a plain
# string rather than a 2-tuple like this default — confirm how callers consume it.
ES_BASIC_AUTH = env("ES_BASIC_AUTH", default=("elastic", "iHktg66E"))
ES_VERIFY_CERTS = env.bool("ES_VERIFY_CERTS", default=False)
62 changes: 62 additions & 0 deletions core/tests_standardizer.py
Original file line number Diff line number Diff line change
Expand Up @@ -137,3 +137,65 @@ def test_standardize_doi_is_valid_with_doi_prefix_and_https_prefix(self):
doi = 'doi:https://doi.org/10.1590/S0102-67202020000100001'
standardized = standardizer.standardize_doi(doi)
self.assertEqual(standardized, '10.1590/S0102-67202020000100001')


class TestStandardizeYearOfPublication(TestCase):
    """Pin the accepted and rejected inputs of ``standardize_year_of_publication``.

    Every rejected input is expected to produce an empty string (never ``None``).
    """

    def test_standardize_year_of_publication_four_digit_year(self):
        """A four-digit year string is returned unchanged."""
        year = "2023"
        result = standardizer.standardize_year_of_publication(year)
        self.assertEqual(result, "2023")

    def test_standardize_year_of_publication_integer_year(self):
        """An integer year is returned in its string form."""
        year = 2023
        result = standardizer.standardize_year_of_publication(year)
        self.assertEqual(result, "2023")

    def test_standardize_year_of_publication_year_range(self):
        """A hyphenated year range yields its first year."""
        year = "2020-2023"
        result = standardizer.standardize_year_of_publication(year)
        self.assertEqual(result, "2020")

    def test_standardize_year_of_publication_year_with_slash(self):
        """A slash-separated pair of years yields its first year."""
        year = "2020/2023"
        result = standardizer.standardize_year_of_publication(year)
        self.assertEqual(result, "2020")

    def test_standardize_year_of_publication_year_with_extra_text(self):
        """Text that does not start with the year yields an empty string."""
        year = "Published in 2023"
        result = standardizer.standardize_year_of_publication(year)
        self.assertEqual(result, "")

    def test_standardize_year_of_publication_invalid_year(self):
        """A non-numeric string yields an empty string."""
        year = "invalid"
        result = standardizer.standardize_year_of_publication(year)
        self.assertEqual(result, '')

    def test_standardize_year_of_publication_empty_string(self):
        """An empty string yields an empty string."""
        year = ""
        result = standardizer.standardize_year_of_publication(year)
        self.assertEqual(result, '')

    def test_standardize_year_of_publication_none_input(self):
        """None yields an empty string."""
        year = None
        result = standardizer.standardize_year_of_publication(year)
        self.assertEqual(result, '')

    def test_standardize_year_of_publication_two_digit_year(self):
        """A two-digit year is rejected (not expanded) and yields an empty string."""
        year = "23"
        result = standardizer.standardize_year_of_publication(year)
        self.assertEqual(result, '')

    def test_standardize_year_of_publication_year_with_parentheses(self):
        """A parenthesised year is not extracted and yields an empty string."""
        year = "(2023)"
        result = standardizer.standardize_year_of_publication(year)
        self.assertEqual(result, '')
27 changes: 27 additions & 0 deletions core/utils/standardizer.py
Original file line number Diff line number Diff line change
Expand Up @@ -218,3 +218,30 @@ def standardize_pid_generic(pid_generic):
return pid_generic_based_on_doi

return pid_generic.strip().upper()


def standardize_year_of_publication(year_of_publication):
    """
    Standardizes a year of publication.

    Strings are stripped of surrounding whitespace and reduced to their first
    four characters, so dates and ranges such as ``"2020-05-01"`` or
    ``"2020/2023"`` resolve to their leading year. The value is accepted only
    if it converts to an integer between 1500 and 2100 (inclusive).

    Parameters:
        year_of_publication (str | int | None): The year of publication to be standardized.

    Returns:
        str: The standardized four-digit year, or an empty string if the input
        is empty, not convertible to an integer, or outside 1500-2100.
    """
    if not year_of_publication:
        return ''

    if isinstance(year_of_publication, str):
        # Strip before truncating: otherwise " 2023" would be cut to " 202"
        # and rejected as out of range.
        year_of_publication = year_of_publication.strip()[:4]

    try:
        year = int(year_of_publication)
    except (TypeError, ValueError, OverflowError):
        # TypeError: unsupported types such as list/dict.
        # OverflowError: float infinity. ValueError: non-numeric text or NaN.
        return ''

    if 1500 <= year <= 2100:
        return str(year)

    return ''
1 change: 0 additions & 1 deletion journal/models.py
Original file line number Diff line number Diff line change
Expand Up @@ -80,7 +80,6 @@ def metadata(cls, collection=None):
'scielo_issn', 'subject_areas', 'title', 'wos_subject_areas'
):
yield {
'id': journal.id,
'acronym': journal.acronym,
'collection': journal.collection.acron3,
'issns': set([v for v in journal.issns.values() if v]),
Expand Down
2 changes: 2 additions & 0 deletions log_manager/choices.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,7 @@
LOG_FILE_STATUS_QUEUED = 'QUE'
LOG_FILE_STATUS_PARSING = 'PAR'
LOG_FILE_STATUS_PROCESSED = 'PRO'
LOG_FILE_STATUS_ERROR = 'ERR'
LOG_FILE_STATUS_INVALIDATED = 'INV'
LOG_FILE_STATUS_IGNORED = 'IGN'

Expand All @@ -13,6 +14,7 @@
(LOG_FILE_STATUS_QUEUED, _("Queued")),
(LOG_FILE_STATUS_PARSING, _("Parsing")),
(LOG_FILE_STATUS_PROCESSED, _("Processed")),
(LOG_FILE_STATUS_ERROR, _("Error")),
(LOG_FILE_STATUS_INVALIDATED, _("Invalidated")),
(LOG_FILE_STATUS_IGNORED, _("Ignored")),
]
Expand Down
19 changes: 19 additions & 0 deletions log_manager/migrations/0006_logfile_last_processed_line.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,19 @@
# Generated by Django 5.0.7 on 2025-06-22 15:21

from django.db import migrations, models


class Migration(migrations.Migration):
    """Add the ``last_processed_line`` integer field to the ``logfile`` model."""

    dependencies = [
        ("log_manager", "0005_alter_logfile_status_alter_logfiledate_date_and_more"),
    ]

    operations = [
        migrations.AddField(
            model_name="logfile",
            name="last_processed_line",
            # Defaults to 0 and may be left blank in forms.
            field=models.IntegerField(
                verbose_name="Last Processed Line",
                default=0,
                blank=True,
            ),
        ),
    ]
19 changes: 19 additions & 0 deletions log_manager/migrations/0007_logfile_summary.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,19 @@
# Generated by Django 5.0.7 on 2025-06-22 17:30

from django.db import migrations, models


class Migration(migrations.Migration):
    """Add the nullable ``summary`` JSON field to the ``logfile`` model."""

    dependencies = [
        ("log_manager", "0006_logfile_last_processed_line"),
    ]

    operations = [
        migrations.AddField(
            model_name="logfile",
            name="summary",
            # Defaults to a fresh empty dict; NULL and blank are both allowed.
            field=models.JSONField(
                verbose_name="Summary",
                default=dict,
                null=True,
                blank=True,
            ),
        ),
    ]
29 changes: 29 additions & 0 deletions log_manager/migrations/0008_alter_logfile_status.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,29 @@
# Generated by Django 5.0.7 on 2025-08-06 19:01

from django.db import migrations, models


class Migration(migrations.Migration):
    """Re-declare ``logfile.status`` so its choices include the ``ERR`` entry."""

    dependencies = [
        ("log_manager", "0007_logfile_summary"),
    ]

    operations = [
        migrations.AlterField(
            model_name="logfile",
            name="status",
            field=models.CharField(
                verbose_name="Status",
                max_length=3,
                # Three-letter status codes paired with their display labels.
                choices=[
                    ("CRE", "Created"),
                    ("QUE", "Queued"),
                    ("PAR", "Parsing"),
                    ("PRO", "Processed"),
                    ("ERR", "Error"),
                    ("INV", "Invalidated"),
                    ("IGN", "Ignored"),
                ],
            ),
        ),
    ]
Original file line number Diff line number Diff line change
@@ -0,0 +1,19 @@
# Generated by Django 5.0.7 on 2025-08-07 00:14

from django.db import migrations, models


class Migration(migrations.Migration):
    """Add ``exported_files_count`` to the ``collectionlogfiledatecount`` model."""

    dependencies = [
        ("log_manager", "0008_alter_logfile_status"),
    ]

    operations = [
        migrations.AddField(
            model_name="collectionlogfiledatecount",
            name="exported_files_count",
            # Small-integer counter that starts at 0.
            field=models.SmallIntegerField(
                verbose_name="Exported Files Count",
                default=0,
            ),
        ),
    ]
Loading