diff --git a/.github/workflows/figshare-processing.yaml b/.github/workflows/figshare-processing.yaml
index 53fad7e..67dd566 100644
--- a/.github/workflows/figshare-processing.yaml
+++ b/.github/workflows/figshare-processing.yaml
@@ -2,8 +2,17 @@ name: figshare-cache
 on:
   workflow_dispatch:
+    inputs:
+      use_author_cache:
+        description: 'Use cached author data (instead of refreshing)'
+        required: false
+        default: 'false'
+        type: choice
+        options:
+          - 'true'
+          - 'false'
   schedule:
-    - cron: "30 2 * * 2"
+    - cron: "30 */4 * * *"
   push:
     branches:
       - main
 
@@ -20,13 +29,18 @@ jobs:
           fetch-depth: 1
 
       - name: Use Cache in folder ./output
-        uses: actions/cache@v3
+        id: cache-restore-output
+        uses: actions/cache/restore@v5
         with:
           path: ./output
-          key: cache-files
+          key: cache-files-${{ github.run_id }}
+          restore-keys: |
+            cache-files-
 
       - name: Create output directory if it doesn't exist
-        run: mkdir -p output
+        run: |
+          mkdir -p output
+          find ./output
 
       - run: echo "🎉 The job was automatically triggered by a ${{ github.event_name }} event."
 
@@ -46,11 +60,30 @@ jobs:
           pip install -r requirements-frozen.txt
 
       - name: Run figshare exporter
+        env:
+          FIGSHARE_TOKEN: ${{ secrets.FIGSHARE_TOKEN }}
         run: |
           set -e
           cd ./output
-          python ../figshare.py --force-refresh
+          if [ "${{ github.event_name }}" = "workflow_dispatch" ] && [ "${{ github.event.inputs.use_author_cache }}" = "true" ]; then
+            echo "Running with --use-author-cache (manually triggered)"
+            python ../figshare.py --use-author-cache
+          else
+            echo "Running without cache (default behavior)"
+            python ../figshare.py
+          fi
+      - name: Save Cache from folder ./output
+        uses: actions/cache/save@v5
+        if: always()
+        with:
+          path: ./output
+          key: ${{ steps.cache-restore-output.outputs.cache-primary-key || 'cache-files' }}
+
+      - name: Generate publication statistics
+        run: |
+          cd ./output
+          python ../generate_stats.py --all-csv figshare_articles_all.csv --dedup-csv figshare_articles.csv >> $GITHUB_STEP_SUMMARY
 
       - name: Nexus Repo Publish bibtex
         if: ${{ github.event_name != 'pull_request' }}
@@ -97,6 +130,6 @@ jobs:
         with:
           name: outputs
           path: |
-            ./output/*.csv
+            ./output/*.csv
+            ./output/*.bib
           retention-days: 30
diff --git a/README.md b/README.md
new file mode 100644
index 0000000..edf95d4
--- /dev/null
+++ b/README.md
@@ -0,0 +1,187 @@
+# LCAS EPrint Cache
+
+This repository automatically exports and caches publication data from Figshare for LCAS (Lincoln Centre for Autonomous Systems) researchers.
+
+## Overview
+
+The system:
+- Retrieves publication metadata from the Figshare repository
+- Processes author information and generates BibTeX entries
+- Exports data in CSV and BibTeX formats
+- Publishes to a Nexus repository for public access
+
+## Setup
+
+### Prerequisites
+
+- Python 3.10+
+- Figshare API token (required)
+
+### Configuration
+
+#### Figshare API Token
+
+This application requires a Figshare API token to function properly. To set it up:
+
+1. **Create a Figshare account**: Visit [https://figshare.com](https://figshare.com) and create an account
+2. **Generate an API token**:
+   - Log in to Figshare
+   - Go to Account Settings → Applications
+   - Create a new personal token
+   - Copy the token securely
+3. **For local development**: Set the environment variable
+   ```bash
+   export FIGSHARE_TOKEN="your_token_here"
+   ```
+4. **For GitHub Actions**: Add the token as a repository secret named `FIGSHARE_TOKEN`
+   - Go to repository Settings → Secrets and variables → Actions
+   - Create a new secret named `FIGSHARE_TOKEN`
+   - Paste your Figshare API token
+
+**Note**: Without a valid API token, requests to the Figshare API will fail with 403 errors.
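+
+To confirm the token is being picked up, you can make a single authenticated request against the Figshare API (a minimal sanity check, separate from the exporter; it assumes the `/account` endpoint is readable with a personal token):
+
+```bash
+# Should return your account details as JSON rather than a 403 error
+curl -s -H "Authorization: token $FIGSHARE_TOKEN" https://api.figshare.com/v2/account
+```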
+
+### Installation
+
+```bash
+# Install dependencies
+pip install -r requirements-frozen.txt
+```
+
+## Usage
+
+### Command Line
+
+```bash
+# Run with default authors list
+python figshare.py
+
+# Run with specific authors
+python figshare.py --authors "Marc Hanheide" "Tom Duckett"
+
+# Run with authors from file
+python figshare.py --authors-file staff.json
+
+# Reuse cached author data instead of refreshing from the API
+python figshare.py --use-author-cache
+
+# Adjust rate limiting (default is a 1-second delay between requests)
+python figshare.py --rate-limit-delay 2.0
+
+# Enable debug logging
+python figshare.py --debug
+
+# Custom output filenames
+python figshare.py --output my_articles.csv --output-all my_articles_all.csv
+```
+
+### Arguments
+
+- `-a, --authors`: List of author names to process
+- `-f, --authors-file`: Path to file containing author names (one per line)
+- `-s, --since`: Process only publications since this date (YYYY-MM-DD), default: 2021-01-01
+- `-o, --output`: Output CSV filename for deduplicated publications, default: figshare_articles.csv
+- `-O, --output-all`: Output CSV filename for all publications (with duplicates), default: figshare_articles_all.csv
+- `--use-author-cache`: Use cached author data instead of refreshing from the API
+- `--rate-limit-delay`: Delay in seconds between Figshare API requests, default: 1.0
+- `--max-retries`: Maximum number of retry attempts for 403 errors, default: 1
+- `--debug`: Enable debug logging
+
+## Output Files
+
+The script generates several output files:
+
+- `lcas.bib`: Combined BibTeX file with all publications (deduplicated)
+- `figshare_articles.csv`: CSV with deduplicated articles
+- `figshare_articles_all.csv`: CSV with all articles (includes duplicates when a paper has several tracked authors)
+- `{author_name}.bib`: Individual BibTeX files per author
+- `{author_name}.csv`: Individual CSV files per author
+- `{author_name}.db`: Cached data per author (shelve database)
+
+## Cache Files
+
+The application uses several cache files to minimize API calls:
+
+- `figshare_cache.db`: Cached Figshare API responses (shelve database)
+- `bibtext_cache`: Cached BibTeX entries from DOI lookups
+- `shortdoi_cache`: Cached short DOI mappings
+- `crossref_cache.db`: Cached Crossref API responses for DOI guessing
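+
+If cached responses look stale, the API cache can be inspected or cleared from the shell (a sketch; `figshare_cache.db` is a Python `shelve` database, so depending on the platform the backend may create extra files such as `figshare_cache.db.dat`):
+
+```bash
+# Count cached Figshare API responses
+python -c "import shelve; c = shelve.open('figshare_cache.db'); print(len(c)); c.close()"
+
+# Delete the cache entirely to force fresh API requests on the next run
+rm -f figshare_cache.db*
+```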
+
+## GitHub Actions Workflow
+
+The workflow runs automatically:
+- Every 4 hours, at 30 minutes past the hour (cron `30 */4 * * *`)
+- On push to the main branch
+- On pull requests
+- Manually via workflow_dispatch, with an optional `use_author_cache` input
+
+In every run, the `./output` folder (which holds the Figshare API response cache) is restored from the GitHub Actions cache before the exporter starts and saved again afterwards.
+
+### Manual Workflow Trigger
+
+When manually triggering the workflow:
+1. Go to Actions → figshare-cache workflow
+2. Click "Run workflow"
+3. Choose whether to use cached author data (`use_author_cache`):
+   - **false** (default): Refreshes author data from the Figshare API
+   - **true**: Reuses previously cached author data, which is faster and makes far fewer API requests
+
+**Note**: The default refresh makes many API requests and takes longer to complete; select **true** when the very latest Figshare data is not required.
+
+### Workflow Steps
+
+1. Checkout repository
+2. Restore cache
+3. Install Python dependencies
+4. Run Figshare exporter (with or without `--use-author-cache`, depending on the trigger)
+5. Save cache
+6. Generate publication statistics for the workflow summary
+7. Publish results to Nexus repository
+8. Upload artifacts
+
+### Rate Limiting
+
+The script includes built-in rate limiting with a 1-second delay between API requests to avoid hitting Figshare API rate limits. This helps ensure reliable operation even with authenticated requests.
+
+## Troubleshooting
+
+### 403 Forbidden Errors
+
+If you encounter 403 errors when accessing the Figshare API:
+1. Ensure the `FIGSHARE_TOKEN` environment variable is set
+2. Verify the token is valid and hasn't expired
+3. Check that the token has appropriate permissions (read access to public articles)
+
+For detailed information about the 403 error and resolution steps, see [FIGSHARE_API_RESEARCH.md](FIGSHARE_API_RESEARCH.md).
+
+### Empty Results
+
+If no articles are found:
+- Check that author names match exactly as they appear in Figshare
+- Verify the articles are in the Lincoln repository (https://repository.lincoln.ac.uk)
+- Use the `--debug` flag for detailed logging
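+
+You can also query Figshare's public search API directly to confirm that an author's articles are indexed at all (a quick check, assuming the public `POST /v2/articles/search` endpoint; substitute the author name you are investigating):
+
+```bash
+curl -s -X POST https://api.figshare.com/v2/articles/search \
+  -H "Content-Type: application/json" \
+  -d '{"search_for": "Marc Hanheide"}'
+```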
+
+### JSON Decode Errors
+
+The application validates JSON responses. If issues persist:
+- Check your internet connection
+- Verify that the Figshare API is accessible
+- Review the logs for specific error messages
+
+## Development
+
+### Running Tests
+
+```bash
+# Run with a single test author
+python figshare.py --authors "Marc Hanheide" --debug
+```
+
+### Code Structure
+
+- `figshare.py`: Main script with the FigShare API client and processing logic
+- `doi2bib`: Class for DOI to BibTeX conversion
+- `FigShare`: Class for Figshare API interactions
+- `Author`: Class for author-specific processing
+- `generate_stats.py`: Builds the per-author, per-year publication statistics table used in the workflow summary
+
+## License
+
+[Add license information here]
+
+## Contact
+
+For issues or questions, please open an issue in the GitHub repository.
diff --git a/figshare.py b/figshare.py
index 7725139..8ea2c14 100644
--- a/figshare.py
+++ b/figshare.py
@@ -5,12 +5,9 @@ from json import loads
 from pprint import pformat
 
 import pandas as pd
 
-from functools import lru_cache, wraps
-from datetime import datetime
 from logging import getLogger, basicConfig, INFO, DEBUG
 import os
 
-from pickle import load, dump
 
 from flatten_dict import flatten
@@ -27,6 +24,7 @@
 import argparse
 from datetime import datetime
 from difflib import SequenceMatcher
+import time
 
 basicConfig(level=INFO)
 
@@ -120,29 +118,28 @@ def entries_to_str(self, entries):
 
 
 class FigShare:
-    def __init__(self, page_size=100):
+    def __init__(self, page_size=100, rate_limit_delay=1.0, max_retries=5):
         self.logger = getLogger("FigShare")
         self.token = os.getenv('FIGSHARE_TOKEN')
+        if self.token:
+            self.logger.info("Figshare API: Using authenticated requests")
+        else:
+            self.logger.warning("Figshare API: No authentication token found - using anonymous requests (may hit rate limits or receive 403 errors)")
         self.page_size = page_size
+        self.rate_limit_delay = rate_limit_delay
+        self.max_retries = max_retries
         self.base_url = "https://api.figshare.com/v2"
+
+        if self.rate_limit_delay > 0:
+            self.logger.info(f"Rate limiting enabled: {self.rate_limit_delay} second delay between API requests")
 
-        # if cache file exist, load it
-        self.cache_file = "figshare_cache.pkl"
-        if os.path.exists(self.cache_file):
-            try:
-                with open(self.cache_file, "rb") as f:
-                    self.__cache = load(f)
-                self.logger.debug(f"Loaded cache from {self.cache_file} with {len(self.__cache)} entries")
-            except Exception as e:
-                self.logger.warning(f"Failed to load cache: {e}")
-                self.__cache = {}
-        else:
-            self.logger.info(f"No cache file found at {self.cache_file}")
-            self.__cache = {}
+        # Use shelve for persistent caching
+        self.cache_file = "figshare_cache.db"
 
-    def save_cache(self):
-        with open(self.cache_file, "wb") as f:
-            dump(self.__cache, f)
+        with shelve.open(self.cache_file) as cache:
+            self.logger.info(f"Figshare API: Using cache file {self.cache_file} with {len(cache.keys())} entries")
+            for key in list(cache.keys()):
+                self.logger.debug(f"  existing cache key: {key}")
 
     def __init_params(self):
@@ -150,35 +147,102 @@ def __init_params(self):
             "page_size": self.page_size
         }
 
-    def __get(self, url, params=None, use_cache=True):
-        hash_key = f"GET{url}?{params}"
-        if hash_key in self.__cache and use_cache:
-            return self.__cache[hash_key]
+    def __handle_403_error(self, url, method="GET", response_text=""):
+        """Handle 403 Forbidden errors with helpful messages"""
+        if not self.token:
+            self.logger.error(f"403 Forbidden for {method} {self.base_url + url}: "
+                              f"Authentication required. Set FIGSHARE_TOKEN environment variable. "
+                              f"See README.md for instructions.")
         else:
+            self.logger.error(f"403 Forbidden for {method} {self.base_url + url}: "
+                              f"Token may be invalid or lack permissions. "
+                              f"Check token in Figshare account settings.")
+        if response_text:
+            self.logger.error(f"Response text: {response_text}")
+
+    def __get(self, url, params=None, use_cache=True):
+        hash_key = f"GET{url}{'?' + str(params) if params else ''}"
+
+        with shelve.open(self.cache_file) as cache:
+            if hash_key in cache and use_cache:
+                self.logger.info(f"Cache hit for GET {url}")
+                return cache[hash_key]
+
             headers = { "Authorization": "token " + self.token } if self.token else {}
-            response = get(self.base_url + url, headers=headers, params=params)
+
+            # Retry logic for 403 errors
+            for attempt in range(self.max_retries):
+                response = get(self.base_url + url, headers=headers, params=params)
+
+                # Handle 403 Forbidden errors with retry logic
+                if response.status_code == 403:
+                    if attempt < self.max_retries - 1:
+                        # Exponential backoff: 1s, 2s, 4s, 8s, ...
+                        wait_time = 2 ** attempt
+                        self.logger.warning(f"403 Forbidden for GET {url} (attempt {attempt + 1}/{self.max_retries}), retrying in {wait_time}s...")
+                        time.sleep(wait_time)
+                        continue
+                    else:
+                        # Final attempt failed, log error and return
+                        self.__handle_403_error(url, "GET", response.text)
+                        return {}
+
+                # Success - break out of retry loop
+                break
+
+            # Rate limiting: sleep after each API request
+            if self.rate_limit_delay > 0:
+                time.sleep(self.rate_limit_delay)
+
             # Check if response is valid and contains JSON
             if response.ok and response.headers.get('Content-Type', '').lower().startswith('application/json') and response.text.strip():
                 result = response.json()
-                self.__cache[hash_key] = result
-                self.save_cache()
+                cache[hash_key] = result
+                self.logger.debug(f"Cached result for GET {url}")
                 return result
             else:
                 self.logger.warning(f"Received empty or invalid JSON response for GET {self.base_url + url} (status: {response.status_code})")
                 return {}
 
     def __post(self, url, params=None, use_cache=True):
-        hash_key = f"POST{url}?{params}"
-        if hash_key in self.__cache and use_cache:
-            return self.__cache[hash_key]
-        else:
+        hash_key = f"POST{url}{'?' + str(params) if params else ''}"
+
+        with shelve.open(self.cache_file) as cache:
+            if hash_key in cache and use_cache:
+                self.logger.debug(f"Cache hit for POST {url}")
+                return cache[hash_key]
+
             headers = { "Authorization": "token " + self.token } if self.token else {}
-            response = post(self.base_url + url, headers=headers, json=params)
+
+            # Retry logic for 403 errors
+            for attempt in range(self.max_retries):
+                response = post(self.base_url + url, headers=headers, json=params)
+
+                # Handle 403 Forbidden errors with retry logic
+                if response.status_code == 403:
+                    if attempt < self.max_retries - 1:
+                        # Exponential backoff: 1s, 2s, 4s, 8s, ...
+                        wait_time = 2 ** attempt
+                        self.logger.warning(f"403 Forbidden for POST {url} (attempt {attempt + 1}/{self.max_retries}), retrying in {wait_time}s...")
+                        time.sleep(wait_time)
+                        continue
+                    else:
+                        # Final attempt failed, log error and return
+                        self.__handle_403_error(url, "POST", response.text)
+                        return []
+
+                # Success - break out of retry loop
+                break
+
+            # Rate limiting: sleep after each API request
+            if self.rate_limit_delay > 0:
+                time.sleep(self.rate_limit_delay)
+
             # Check if response is valid and contains JSON
             if response.ok and response.headers.get('Content-Type', '').lower().startswith('application/json') and response.text.strip():
                 result = response.json()
-                self.__cache[hash_key] = result
-                self.save_cache()
+                cache[hash_key] = result
+                self.logger.debug(f"Cached result for POST {url}")
                 return result
             else:
                 self.logger.warning(f"Received empty or invalid JSON response for POST {self.base_url + url} (status: {response.status_code})")
@@ -206,12 +270,12 @@ def get_article(self, article_id, use_cache=True):
         return self.__get(f"/articles/{article_id}", use_cache=use_cache)
 
 
 class Author:
-    def __init__(self, name, debug=False):
+    def __init__(self, name, debug=False, rate_limit_delay=1.0, max_retries=5):
         self.logger = getLogger("Author")
         if debug:
             self.logger.setLevel(DEBUG)
         self.name = name
-        self.fs = FigShare()
+        self.fs = FigShare(rate_limit_delay=rate_limit_delay, max_retries=max_retries)
         self.articles = {}
         self.public_html_prefix = "https://repository.lincoln.ac.uk"
         self.df = None
@@ -395,7 +459,7 @@ def _flatten(self):
     def retrieve(self, use_cache=True):
         self._retrieve_figshare(use_cache=use_cache)
         self._remove_non_repository()
-        self._retrieve_details()
+        self._retrieve_details(use_cache=True)
         self._custom_fields_to_dicts()
         self._flatten()
         self._create_dataframe()
@@ -441,9 +505,9 @@ def parse_args():
         formatter_class=argparse.ArgumentDefaultsHelpFormatter
     )
     parser.add_argument('-a', '--authors', nargs='+',
-                        help='List of author names to process')
+                        help='List of author names to process (uses default list if not specified)')
     parser.add_argument('-f', '--authors-file', type=str,
-                        help='Path to file containing list of authors (one per line)')
+                        help='Path to file containing list of authors, one per line (uses default list if not specified)')
     parser.add_argument('-s', '--since', type=str, default='2021-01-01',
                         help='Process only publications since this date (YYYY-MM-DD)')
     parser.add_argument('-o', '--output', type=str, default='figshare_articles.csv',
@@ -452,8 +516,12 @@
                         help='Output CSV filename for deduplicated publications by authors')
     parser.add_argument('-O', '--output-all', type=str, default='figshare_articles_all.csv',
                         help='Output CSV filename for all publications by authors (includes duplicates when multiple authors per output)')
     # parser.add_argument('-r', '--recent-output', type=str, default='figshare_articles_recent.csv',
     #                     help='Output CSV filename for publications since specified date')
-    parser.add_argument('--force-refresh', action='store_true',
-                        help='Force refresh data instead of loading from cache')
+    parser.add_argument('--use-author-cache', action='store_true',
+                        help='Use cached author data instead of refreshing from API')
+    parser.add_argument('--rate-limit-delay', type=float, default=1.0,
+                        help='Delay in seconds between Figshare API requests (default: 1.0)')
+    parser.add_argument('--max-retries', type=int, default=1,
+                        help='Maximum number of retry attempts for 403 errors (default: 1)')
     parser.add_argument('--debug', action='store_true',
                         help='Enable debug logging')
@@ -514,15 +582,15 @@ def figshare_processing():
     for author_name in authors_list:
         logger.info(f"*** Processing {author_name}...")
-        authors[author_name] = Author(author_name, debug=args.debug)
+        authors[author_name] = Author(author_name, debug=args.debug, rate_limit_delay=args.rate_limit_delay, max_retries=args.max_retries)
 
         cache_exists = os.path.exists(f"{author_name}.db")
-        if cache_exists and not args.force_refresh:
+        if cache_exists and args.use_author_cache:
             logger.info(f"Loading cached data for {author_name}")
             authors[author_name].load()
         else:
             logger.info(f"Retrieving data for {author_name}")
-            authors[author_name].retrieve(not args.force_refresh)
+            authors[author_name].retrieve(args.use_author_cache)
             authors[author_name].save()
 
         if authors[author_name].df is not None:
diff --git a/generate_stats.py b/generate_stats.py
new file mode 100755
index 0000000..653eac1
--- /dev/null
+++ b/generate_stats.py
@@ -0,0 +1,111 @@
+#!/usr/bin/env python
+# -*- coding: utf-8 -*-
+
+"""
+Generate publication statistics from figshare articles CSV.
+Outputs a markdown table showing publications per author per year.
+"""
+
+import pandas as pd
+import sys
+import argparse
+from pathlib import Path
+
+def generate_statistics(all_csv='figshare_articles_all.csv', dedup_csv='figshare_articles.csv'):
+    """
+    Read the figshare articles CSVs and generate statistics.
+
+    Args:
+        all_csv: CSV file with all publications (includes duplicates for multi-author papers)
+        dedup_csv: CSV file with deduplicated publications (for calculating true totals)
+
+    Returns:
+        A markdown table string showing statistics.
+    """
+    try:
+        # Read the per-author CSV file (includes duplicates for multi-author papers)
+        df_all = pd.read_csv(all_csv)
+
+        # Read the deduplicated CSV file (for accurate totals)
+        df_dedup = pd.read_csv(dedup_csv)
+
+        if df_all.empty:
+            return "No publication data available."
+
+        # Ensure we have the required columns
+        if 'author' not in df_all.columns or 'online_year' not in df_all.columns:
+            return "Error: Required columns (author, online_year) not found in all articles CSV."
+
+        if 'online_year' not in df_dedup.columns:
+            return "Error: Required column (online_year) not found in deduplicated CSV."
+
+        # Group by author and year, count publications per author
+        stats = df_all.groupby(['author', 'online_year']).size().reset_index(name='count')
+
+        # Pivot to get years as columns
+        pivot = stats.pivot(index='author', columns='online_year', values='count').fillna(0).astype(int)
+
+        # Sort columns (years) in descending order (most recent first)
+        pivot = pivot[sorted(pivot.columns, reverse=True)]
+
+        # Calculate total per author (from their individual publications)
+        pivot['Total'] = pivot.sum(axis=1)
+
+        # Sort by total publications (descending)
+        pivot = pivot.sort_values('Total', ascending=False)
+
+        # Calculate actual yearly totals from deduplicated data
+        dedup_by_year = df_dedup.groupby('online_year').size()
+
+        # Generate markdown table
+        md_lines = ["# Publication Statistics by Author and Year", ""]
+        md_lines.append(f"**Total Authors:** {len(pivot)}\n")
+        md_lines.append(f"**Total Publications (deduplicated):** {len(df_dedup)}\n")
+        md_lines.append("")
+
+        # Create table header
+        headers = ['**Author**', '**Total**'] + [str(year) for year in pivot.columns if year != 'Total']
+        md_lines.append('| ' + ' | '.join(headers) + ' |')
+        md_lines.append('| ' + ' | '.join(['---' for _ in headers]) + ' |')
+
+        # Create table rows
+        for author, row in pivot.iterrows():
+            values = [f"**{author}**", f"**{int(row['Total'])}**"] + [str(int(row[year])) if row[year] > 0 else '-' for year in pivot.columns if year != 'Total']
+            md_lines.append('| ' + ' | '.join(values) + ' |')
+
+        # Add yearly totals row using deduplicated data
+        year_columns = [year for year in pivot.columns if year != 'Total']
+        year_totals = ['**Total (unique)**', f"**{len(df_dedup)}**"] + [str(int(dedup_by_year.get(year, 0))) for year in year_columns]
+        md_lines.append('| ' + ' | '.join(year_totals) + ' |')
+
+        return '\n'.join(md_lines)
+
+    except FileNotFoundError as e:
+        return f"Error: File not found - {e.filename}"
+    except Exception as e:
+        return f"Error generating statistics: {str(e)}"
+
+if __name__ == "__main__":
+    parser = argparse.ArgumentParser(
+        description="Generate publication statistics from FigShare articles CSV files.",
+        formatter_class=argparse.ArgumentDefaultsHelpFormatter
+    )
+    parser.add_argument(
+        '--all-csv',
+        type=str,
+        default='figshare_articles_all.csv',
+        help='Path to CSV file with all publications (includes duplicates for multi-author papers)'
+    )
+    parser.add_argument(
+        '--dedup-csv',
+        type=str,
+        default='figshare_articles.csv',
+        help='Path to CSV file with deduplicated publications (for accurate total counts)'
+    )
+
+    args = parser.parse_args()
+
+    # Generate and print statistics
+    stats = generate_statistics(args.all_csv, args.dedup_csv)
+    print(stats)