diff --git a/.github/workflows/figshare-processing.yaml b/.github/workflows/figshare-processing.yaml
index 53fad7e..67dd566 100644
--- a/.github/workflows/figshare-processing.yaml
+++ b/.github/workflows/figshare-processing.yaml
@@ -2,8 +2,17 @@ name: figshare-cache
 on:
   workflow_dispatch:
+    inputs:
+      use_author_cache:
+        description: 'Use cached author data (instead of refreshing)'
+        required: false
+        default: 'false'
+        type: choice
+        options:
+          - 'true'
+          - 'false'
   schedule:
-    - cron: "30 2 * * 2"
+    - cron: "30 */4 * * *"
   push:
     branches:
       - main
 
@@ -20,13 +29,18 @@ jobs:
           fetch-depth: 1
 
       - name: Use Cache in folder ./output
-        uses: actions/cache@v3
+        id: cache-restore-output
+        uses: actions/cache/restore@v5
         with:
           path: ./output
-          key: cache-files
+          key: cache-files-${{ github.run_id }}
+          restore-keys: |
+            cache-files-
 
       - name: Create output directory if it doesn't exist
-        run: mkdir -p output
+        run: |
+          mkdir -p output
+          find ./output
 
       - run: echo "🎉 The job was automatically triggered by a ${{ github.event_name }} event."
 
@@ -46,11 +60,30 @@ jobs:
           pip install -r requirements-frozen.txt
 
       - name: Run figshare exporter
+        env:
+          FIGSHARE_TOKEN: ${{ secrets.FIGSHARE_TOKEN }}
         run: |
           set -e
           cd ./output
-          python ../figshare.py --force-refresh
+          if [ "${{ github.event_name }}" = "workflow_dispatch" ] && [ "${{ github.event.inputs.use_author_cache }}" = "true" ]; then
+            echo "Running with --use-author-cache (manually triggered)"
+            python ../figshare.py --use-author-cache
+          else
+            echo "Running without cache (default behavior)"
+            python ../figshare.py
+          fi
+      - name: Save Cache from folder ./output
+        uses: actions/cache/save@v5
+        if: always()
+        with:
+          path: ./output
+          key: ${{ steps.cache-restore-output.outputs.cache-primary-key || 'cache-files' }}
+
+      - name: Generate publication statistics
+        run: |
+          cd ./output
+          python ../generate_stats.py --all-csv figshare_articles_all.csv --dedup-csv figshare_articles.csv >> $GITHUB_STEP_SUMMARY
 
       - name: Nexus Repo Publish bibtex
         if: ${{ github.event_name != 'pull_request' }}
@@ -97,6 +130,6 @@ jobs:
         with:
           name: outputs
           path: |
-            ./output/*.csv
+            ./output/*.csv
+            ./output/*.bib
           retention-days: 30
diff --git a/README.md b/README.md
new file mode 100644
index 0000000..edf95d4
--- /dev/null
+++ b/README.md
@@ -0,0 +1,187 @@
+# LCAS EPrint Cache
+
+This repository automatically exports and caches publication data from Figshare for LCAS (Lincoln Centre for Autonomous Systems) researchers.
+
+## Overview
+
+The system:
+- Retrieves publication metadata from the Figshare repository
+- Processes author information and generates BibTeX entries
+- Exports data in CSV and BibTeX formats
+- Publishes to a Nexus repository for public access
+
+## Setup
+
+### Prerequisites
+
+- Python 3.10+
+- Figshare API token (required)
+
+### Configuration
+
+#### Figshare API Token
+
+This application requires a Figshare API token to function properly. To set it up:
+
+1. **Create a Figshare account**: Visit [https://figshare.com](https://figshare.com) and create an account
+2. **Generate an API token**:
+   - Log in to Figshare
+   - Go to Account Settings → Applications
+   - Create a new personal token
+   - Copy the token securely
+3. **For local development**: Set the environment variable
+   ```bash
+   export FIGSHARE_TOKEN="your_token_here"
+   ```
+4. **For GitHub Actions**: Add the token as a repository secret named `FIGSHARE_TOKEN`
+   - Go to repository Settings → Secrets and variables → Actions
+   - Create a new secret named `FIGSHARE_TOKEN`
+   - Paste your Figshare API token
+
+**Note**: Without a valid API token, requests to the Figshare API will fail with 403 errors.
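+
+To confirm the token is being picked up, you can make a single authenticated request against the Figshare API (a minimal sanity check, separate from the exporter; it assumes the `/account` endpoint is readable with a personal token):
+
+```bash
+# Should return your account details as JSON rather than a 403 error
+curl -s -H "Authorization: token $FIGSHARE_TOKEN" https://api.figshare.com/v2/account
+```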
+
+### Installation
+
+```bash
+# Install dependencies
+pip install -r requirements-frozen.txt
+```
+
+## Usage
+
+### Command Line
+
+```bash
+# Run with default authors list
+python figshare.py
+
+# Run with specific authors
+python figshare.py --authors "Marc Hanheide" "Tom Duckett"
+
+# Run with authors from file
+python figshare.py --authors-file staff.json
+
+# Reuse cached author data instead of refreshing from the API
+python figshare.py --use-author-cache
+
+# Adjust rate limiting (default is a 1-second delay between requests)
+python figshare.py --rate-limit-delay 2.0
+
+# Enable debug logging
+python figshare.py --debug
+
+# Custom output filenames
+python figshare.py --output my_articles.csv --output-all my_articles_all.csv
+```
+
+### Arguments
+
+- `-a, --authors`: List of author names to process
+- `-f, --authors-file`: Path to file containing author names (one per line)
+- `-s, --since`: Process only publications since this date (YYYY-MM-DD), default: 2021-01-01
+- `-o, --output`: Output CSV filename for deduplicated publications, default: figshare_articles.csv
+- `-O, --output-all`: Output CSV filename for all publications (with duplicates), default: figshare_articles_all.csv
+- `--use-author-cache`: Use cached author data instead of refreshing from the API
+- `--rate-limit-delay`: Delay in seconds between Figshare API requests, default: 1.0
+- `--max-retries`: Maximum number of retry attempts for 403 errors, default: 1
+- `--debug`: Enable debug logging
+
+## Output Files
+
+The script generates several output files:
+
+- `lcas.bib`: Combined BibTeX file with all publications (deduplicated)
+- `figshare_articles.csv`: CSV with deduplicated articles
+- `figshare_articles_all.csv`: CSV with all articles (includes duplicates when a paper has several tracked authors)
+- `{author_name}.bib`: Individual BibTeX files per author
+- `{author_name}.csv`: Individual CSV files per author
+- `{author_name}.db`: Cached data per author (shelve database)
+
+## Cache Files
+
+The application uses several cache files to minimize API calls:
+
+- `figshare_cache.db`: Cached Figshare API responses (shelve database)
+- `bibtext_cache`: Cached BibTeX entries from DOI lookups
+- `shortdoi_cache`: Cached short DOI mappings
+- `crossref_cache.db`: Cached Crossref API responses for DOI guessing
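+
+If cached responses look stale, the API cache can be inspected or cleared from the shell (a sketch; `figshare_cache.db` is a Python `shelve` database, so depending on the platform the backend may create extra files such as `figshare_cache.db.dat`):
+
+```bash
+# Count cached Figshare API responses
+python -c "import shelve; c = shelve.open('figshare_cache.db'); print(len(c)); c.close()"
+
+# Delete the cache entirely to force fresh API requests on the next run
+rm -f figshare_cache.db*
+```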
+
+## GitHub Actions Workflow
+
+The workflow runs automatically:
+- Every 4 hours, at 30 minutes past the hour (cron `30 */4 * * *`)
+- On push to the main branch
+- On pull requests
+- Manually via workflow_dispatch, with an optional `use_author_cache` input
+
+In every run, the `./output` folder (which holds the Figshare API response cache) is restored from the GitHub Actions cache before the exporter starts and saved again afterwards.
+
+### Manual Workflow Trigger
+
+When manually triggering the workflow:
+1. Go to Actions → figshare-cache workflow
+2. Click "Run workflow"
+3. Choose whether to use cached author data (`use_author_cache`):
+   - **false** (default): Refreshes author data from the Figshare API
+   - **true**: Reuses previously cached author data, which is faster and makes far fewer API requests
+
+**Note**: The default refresh makes many API requests and takes longer to complete; select **true** when the very latest Figshare data is not required.
+
+### Workflow Steps
+
+1. Checkout repository
+2. Restore cache
+3. Install Python dependencies
+4. Run Figshare exporter (with or without `--use-author-cache`, depending on the trigger)
+5. Save cache
+6. Generate publication statistics for the workflow summary
+7. Publish results to Nexus repository
+8. Upload artifacts
+
+### Rate Limiting
+
+The script includes built-in rate limiting with a 1-second delay between API requests to avoid hitting Figshare API rate limits. This helps ensure reliable operation even with authenticated requests.
+
+## Troubleshooting
+
+### 403 Forbidden Errors
+
+If you encounter 403 errors when accessing the Figshare API:
+1. Ensure the `FIGSHARE_TOKEN` environment variable is set
+2. Verify the token is valid and hasn't expired
+3. Check that the token has appropriate permissions (read access to public articles)
+
+For detailed information about the 403 error and resolution steps, see [FIGSHARE_API_RESEARCH.md](FIGSHARE_API_RESEARCH.md).
+
+### Empty Results
+
+If no articles are found:
+- Check that author names match exactly as they appear in Figshare
+- Verify the articles are in the Lincoln repository (https://repository.lincoln.ac.uk)
+- Use the `--debug` flag for detailed logging
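+
+You can also query Figshare's public search API directly to confirm that an author's articles are indexed at all (a quick check, assuming the public `POST /v2/articles/search` endpoint; substitute the author name you are investigating):
+
+```bash
+curl -s -X POST https://api.figshare.com/v2/articles/search \
+  -H "Content-Type: application/json" \
+  -d '{"search_for": "Marc Hanheide"}'
+```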
+
+### JSON Decode Errors
+
+The application validates JSON responses. If issues persist:
+- Check your internet connection
+- Verify that the Figshare API is accessible
+- Review the logs for specific error messages
+
+## Development
+
+### Running Tests
+
+```bash
+# Run with a single test author
+python figshare.py --authors "Marc Hanheide" --debug
+```
+
+### Code Structure
+
+- `figshare.py`: Main script with the FigShare API client and processing logic
+- `doi2bib`: Class for DOI to BibTeX conversion
+- `FigShare`: Class for Figshare API interactions
+- `Author`: Class for author-specific processing
+- `generate_stats.py`: Builds the per-author, per-year publication statistics table used in the workflow summary
+
+## License
+
+[Add license information here]
+
+## Contact
+
+For issues or questions, please open an issue in the GitHub repository.
diff --git a/figshare.py b/figshare.py
index 7725139..8ea2c14 100644
--- a/figshare.py
+++ b/figshare.py
@@ -5,12 +5,9 @@ from json import loads
 from pprint import pformat
 
 import pandas as pd
 
-from functools import lru_cache, wraps
-from datetime import datetime
 from logging import getLogger, basicConfig, INFO, DEBUG
 import os
 
-from pickle import load, dump
 
 from flatten_dict import flatten
@@ -27,6 +24,7 @@
 import argparse
 from datetime import datetime
 from difflib import SequenceMatcher
+import time
 
 basicConfig(level=INFO)
 
@@ -120,29 +118,28 @@ def entries_to_str(self, entries):
 
 
 class FigShare:
-    def __init__(self, page_size=100):
+    def __init__(self, page_size=100, rate_limit_delay=1.0, max_retries=5):
         self.logger = getLogger("FigShare")
         self.token = os.getenv('FIGSHARE_TOKEN')
+        if self.token:
+            self.logger.info("Figshare API: Using authenticated requests")
+        else:
+            self.logger.warning("Figshare API: No authentication token found - using anonymous requests (may hit rate limits or receive 403 errors)")
         self.page_size = page_size
+        self.rate_limit_delay = rate_limit_delay
+        self.max_retries = max_retries
         self.base_url = "https://api.figshare.com/v2"
+
+        if self.rate_limit_delay > 0:
+            self.logger.info(f"Rate limiting enabled: {self.rate_limit_delay} second delay between API requests")
 
-        # if cache file exist, load it
-        self.cache_file = "figshare_cache.pkl"
-        if os.path.exists(self.cache_file):
-            try:
-                with open(self.cache_file, "rb") as f:
-                    self.__cache = load(f)
-                self.logger.debug(f"Loaded cache from {self.cache_file} with {len(self.__cache)} entries")
-            except Exception as e:
-                self.logger.warning(f"Failed to load cache: {e}")
-                self.__cache = {}
-        else:
-            self.logger.info(f"No cache file found at {self.cache_file}")
-            self.__cache = {}
+        # Use shelve for persistent caching
+        self.cache_file = "figshare_cache.db"
 
-    def save_cache(self):
-        with open(self.cache_file, "wb") as f:
-            dump(self.__cache, f)
+        with shelve.open(self.cache_file) as cache:
+            self.logger.info(f"Figshare API: Using cache file {self.cache_file} with {len(cache.keys())} entries")
+            for key in list(cache.keys()):
+                self.logger.debug(f"  existing cache key: {key}")
 
     def __init_params(self):
@@ -150,35 +147,102 @@ def __init_params(self):
             "page_size": self.page_size
         }
 
-    def __get(self, url, params=None, use_cache=True):
-        hash_key = f"GET{url}?{params}"
-        if hash_key in self.__cache and use_cache:
-            return self.__cache[hash_key]
+    def __handle_403_error(self, url, method="GET", response_text=""):
+        """Handle 403 Forbidden errors with helpful messages"""
+        if not self.token:
+            self.logger.error(f"403 Forbidden for {method} {self.base_url + url}: "
+                              f"Authentication required. Set FIGSHARE_TOKEN environment variable. "
+                              f"See README.md for instructions.")
         else:
+            self.logger.error(f"403 Forbidden for {method} {self.base_url + url}: "
+                              f"Token may be invalid or lack permissions. "
+                              f"Check token in Figshare account settings.")
+        if response_text:
+            self.logger.error(f"Response text: {response_text}")
+
+    def __get(self, url, params=None, use_cache=True):
+        hash_key = f"GET{url}{'?' + str(params) if params else ''}"
+
+        with shelve.open(self.cache_file) as cache:
+            if hash_key in cache and use_cache:
+                self.logger.info(f"Cache hit for GET {url}")
+                return cache[hash_key]
+
             headers = { "Authorization": "token " + self.token } if self.token else {}
-            response = get(self.base_url + url, headers=headers, params=params)
+
+            # Retry logic for 403 errors
+            for attempt in range(self.max_retries):
+                response = get(self.base_url + url, headers=headers, params=params)
+
+                # Handle 403 Forbidden errors with retry logic
+                if response.status_code == 403:
+                    if attempt < self.max_retries - 1:
+                        # Exponential backoff: 1s, 2s, 4s, 8s, ...
+                        wait_time = 2 ** attempt
+                        self.logger.warning(f"403 Forbidden for GET {url} (attempt {attempt + 1}/{self.max_retries}), retrying in {wait_time}s...")
+                        time.sleep(wait_time)
+                        continue
+                    else:
+                        # Final attempt failed, log error and return
+                        self.__handle_403_error(url, "GET", response.text)
+                        return {}
+
+                # Success - break out of retry loop
+                break
+
+            # Rate limiting: sleep after each API request
+            if self.rate_limit_delay > 0:
+                time.sleep(self.rate_limit_delay)
+
             # Check if response is valid and contains JSON
             if response.ok and response.headers.get('Content-Type', '').lower().startswith('application/json') and response.text.strip():
                 result = response.json()
-                self.__cache[hash_key] = result
-                self.save_cache()
+                cache[hash_key] = result
+                self.logger.debug(f"Cached result for GET {url}")
                 return result
             else:
                 self.logger.warning(f"Received empty or invalid JSON response for GET {self.base_url + url} (status: {response.status_code})")
                 return {}
 
     def __post(self, url, params=None, use_cache=True):
-        hash_key = f"POST{url}?{params}"
-        if hash_key in self.__cache and use_cache:
-            return self.__cache[hash_key]
-        else:
+        hash_key = f"POST{url}{'?' + str(params) if params else ''}"
+
+        with shelve.open(self.cache_file) as cache:
+            if hash_key in cache and use_cache:
+                self.logger.debug(f"Cache hit for POST {url}")
+                return cache[hash_key]
+
             headers = { "Authorization": "token " + self.token } if self.token else {}
-            response = post(self.base_url + url, headers=headers, json=params)
+
+            # Retry logic for 403 errors
+            for attempt in range(self.max_retries):
+                response = post(self.base_url + url, headers=headers, json=params)
+
+                # Handle 403 Forbidden errors with retry logic
+                if response.status_code == 403:
+                    if attempt < self.max_retries - 1:
+                        # Exponential backoff: 1s, 2s, 4s, 8s, ...
+                        wait_time = 2 ** attempt
+                        self.logger.warning(f"403 Forbidden for POST {url} (attempt {attempt + 1}/{self.max_retries}), retrying in {wait_time}s...")
+                        time.sleep(wait_time)
+                        continue
+                    else:
+                        # Final attempt failed, log error and return
+                        self.__handle_403_error(url, "POST", response.text)
+                        return []
+
+                # Success - break out of retry loop
+                break
+
+            # Rate limiting: sleep after each API request
+            if self.rate_limit_delay > 0:
+                time.sleep(self.rate_limit_delay)
+
             # Check if response is valid and contains JSON
             if response.ok and response.headers.get('Content-Type', '').lower().startswith('application/json') and response.text.strip():
                 result = response.json()
-                self.__cache[hash_key] = result
-                self.save_cache()
+                cache[hash_key] = result
+                self.logger.debug(f"Cached result for POST {url}")
                 return result
             else:
                 self.logger.warning(f"Received empty or invalid JSON response for POST {self.base_url + url} (status: {response.status_code})")
@@ -206,12 +270,12 @@ def get_article(self, article_id, use_cache=True):
         return self.__get(f"/articles/{article_id}", use_cache=use_cache)
 
 
 class Author:
-    def __init__(self, name, debug=False):
+    def __init__(self, name, debug=False, rate_limit_delay=1.0, max_retries=5):
         self.logger = getLogger("Author")
         if debug:
             self.logger.setLevel(DEBUG)
         self.name = name
-        self.fs = FigShare()
+        self.fs = FigShare(rate_limit_delay=rate_limit_delay, max_retries=max_retries)
         self.articles = {}
         self.public_html_prefix = "https://repository.lincoln.ac.uk"
         self.df = None
@@ -395,7 +459,7 @@ def _flatten(self):
     def retrieve(self, use_cache=True):
         self._retrieve_figshare(use_cache=use_cache)
         self._remove_non_repository()
-        self._retrieve_details()
+        self._retrieve_details(use_cache=True)
         self._custom_fields_to_dicts()
         self._flatten()
         self._create_dataframe()
@@ -441,9 +505,9 @@ def parse_args():
         formatter_class=argparse.ArgumentDefaultsHelpFormatter
     )
     parser.add_argument('-a', '--authors', nargs='+',
-                        help='List of author names to process')
+                        help='List of author names to process (uses default list if not specified)')
     parser.add_argument('-f', '--authors-file', type=str,
-                        help='Path to file containing list of authors (one per line)')
+                        help='Path to file containing list of authors, one per line (uses default list if not specified)')
     parser.add_argument('-s', '--since', type=str, default='2021-01-01',
                         help='Process only publications since this date (YYYY-MM-DD)')
     parser.add_argument('-o', '--output', type=str, default='figshare_articles.csv',
@@ -452,8 +516,12 @@
                         help='Output CSV filename for deduplicated publications by authors')
     parser.add_argument('-O', '--output-all', type=str, default='figshare_articles_all.csv',
                         help='Output CSV filename for all publications by authors (includes duplicates when multiple authors per output)')
     # parser.add_argument('-r', '--recent-output', type=str, default='figshare_articles_recent.csv',
     #                     help='Output CSV filename for publications since specified date')
-    parser.add_argument('--force-refresh', action='store_true',
-                        help='Force refresh data instead of loading from cache')
+    parser.add_argument('--use-author-cache', action='store_true',
+                        help='Use cached author data instead of refreshing from API')
+    parser.add_argument('--rate-limit-delay', type=float, default=1.0,
+                        help='Delay in seconds between Figshare API requests (default: 1.0)')
+    parser.add_argument('--max-retries', type=int, default=1,
+                        help='Maximum number of retry attempts for 403 errors (default: 1)')
     parser.add_argument('--debug', action='store_true',
                         help='Enable debug logging')
@@ -514,15 +582,15 @@ def figshare_processing():
     for author_name in authors_list:
         logger.info(f"*** Processing {author_name}...")
-        authors[author_name] = Author(author_name, debug=args.debug)
+        authors[author_name] = Author(author_name, debug=args.debug, rate_limit_delay=args.rate_limit_delay, max_retries=args.max_retries)
 
         cache_exists = os.path.exists(f"{author_name}.db")
-        if cache_exists and not args.force_refresh:
+        if cache_exists and args.use_author_cache:
             logger.info(f"Loading cached data for {author_name}")
             authors[author_name].load()
         else:
             logger.info(f"Retrieving data for {author_name}")
-            authors[author_name].retrieve(not args.force_refresh)
+            authors[author_name].retrieve(args.use_author_cache)
             authors[author_name].save()
 
         if authors[author_name].df is not None:
diff --git a/generate_stats.py b/generate_stats.py
new file mode 100755
index 0000000..653eac1
--- /dev/null
+++ b/generate_stats.py
@@ -0,0 +1,111 @@
+#!/usr/bin/env python
+# -*- coding: utf-8 -*-
+
+"""
+Generate publication statistics from figshare articles CSV.
+Outputs a markdown table showing publications per author per year.
+"""
+
+import pandas as pd
+import sys
+import argparse
+from pathlib import Path
+
+def generate_statistics(all_csv='figshare_articles_all.csv', dedup_csv='figshare_articles.csv'):
+    """
+    Read the figshare articles CSVs and generate statistics.
+
+    Args:
+        all_csv: CSV file with all publications (includes duplicates for multi-author papers)
+        dedup_csv: CSV file with deduplicated publications (for calculating true totals)
+
+    Returns:
+        A markdown table string showing statistics.
+    """
+    try:
+        # Read the per-author CSV file (includes duplicates for multi-author papers)
+        df_all = pd.read_csv(all_csv)
+
+        # Read the deduplicated CSV file (for accurate totals)
+        df_dedup = pd.read_csv(dedup_csv)
+
+        if df_all.empty:
+            return "No publication data available."
+
+        # Ensure we have the required columns
+        if 'author' not in df_all.columns or 'online_year' not in df_all.columns:
+            return "Error: Required columns (author, online_year) not found in all articles CSV."
+
+        if 'online_year' not in df_dedup.columns:
+            return "Error: Required column (online_year) not found in deduplicated CSV."
+
+        # Group by author and year, count publications per author
+        stats = df_all.groupby(['author', 'online_year']).size().reset_index(name='count')
+
+        # Pivot to get years as columns
+        pivot = stats.pivot(index='author', columns='online_year', values='count').fillna(0).astype(int)
+
+        # Sort columns (years) in descending order (most recent first)
+        pivot = pivot[sorted(pivot.columns, reverse=True)]
+
+        # Calculate total per author (from their individual publications)
+        pivot['Total'] = pivot.sum(axis=1)
+
+        # Sort by total publications (descending)
+        pivot = pivot.sort_values('Total', ascending=False)
+
+        # Calculate actual yearly totals from deduplicated data
+        dedup_by_year = df_dedup.groupby('online_year').size()
+
+        # Generate markdown table
+        md_lines = ["# Publication Statistics by Author and Year", ""]
+        md_lines.append(f"**Total Authors:** {len(pivot)}\n")
+        md_lines.append(f"**Total Publications (deduplicated):** {len(df_dedup)}\n")
+        md_lines.append("")
+
+        # Create table header
+        headers = ['**Author**', '**Total**'] + [str(year) for year in pivot.columns if year != 'Total']
+        md_lines.append('| ' + ' | '.join(headers) + ' |')
+        md_lines.append('| ' + ' | '.join(['---' for _ in headers]) + ' |')
+
+        # Create table rows
+        for author, row in pivot.iterrows():
+            values = [f"**{author}**", f"**{int(row['Total'])}**"] + [str(int(row[year])) if row[year] > 0 else '-' for year in pivot.columns if year != 'Total']
+            md_lines.append('| ' + ' | '.join(values) + ' |')
+
+        # Add yearly totals row using deduplicated data
+        year_columns = [year for year in pivot.columns if year != 'Total']
+        year_totals = ['**Total (unique)**', f"**{len(df_dedup)}**"] + [str(int(dedup_by_year.get(year, 0))) for year in year_columns]
+        md_lines.append('| ' + ' | '.join(year_totals) + ' |')
+
+        return '\n'.join(md_lines)
+
+    except FileNotFoundError as e:
+        return f"Error: File not found - {e.filename}"
+    except Exception as e:
+        return f"Error generating statistics: {str(e)}"
+
+if __name__ == "__main__":
+    parser = argparse.ArgumentParser(
+        description="Generate publication statistics from FigShare articles CSV files.",
+        formatter_class=argparse.ArgumentDefaultsHelpFormatter
+    )
+    parser.add_argument(
+        '--all-csv',
+        type=str,
+        default='figshare_articles_all.csv',
+        help='Path to CSV file with all publications (includes duplicates for multi-author papers)'
+    )
+    parser.add_argument(
+        '--dedup-csv',
+        type=str,
+        default='figshare_articles.csv',
+        help='Path to CSV file with deduplicated publications (for accurate total counts)'
+    )
+
+    args = parser.parse_args()
+
+    # Generate and print statistics
+    stats = generate_statistics(args.all_csv, args.dedup_csv)
+    print(stats)