From c76b40642254b6e31041661c224963aacc18a0d6 Mon Sep 17 00:00:00 2001 From: Jaseem Jas Date: Sun, 24 Aug 2025 12:12:28 +0530 Subject: [PATCH 1/2] fix: extract execution_id from status_api when not directly available - Add URL parsing to extract execution_id from status_api query parameters - Handle 422 status code responses that contain processing status - Include EXECUTING as a valid processing status for polling - Improve robustness of GenericUnstractClient wait_for_completion flow --- src/apihub_client/generic_client.py | 42 +++- test/test_performance.py | 334 ---------------------------- 2 files changed, 40 insertions(+), 336 deletions(-) delete mode 100644 test/test_performance.py diff --git a/src/apihub_client/generic_client.py b/src/apihub_client/generic_client.py index 9e89482..203d4af 100644 --- a/src/apihub_client/generic_client.py +++ b/src/apihub_client/generic_client.py @@ -1,5 +1,6 @@ import logging import time +from urllib.parse import parse_qs, urlparse import requests @@ -32,6 +33,26 @@ def __init__( self.base_url = base_url.rstrip("/") self.headers = {"apikey": self.api_key} + def _extract_execution_id_from_url(self, url: str) -> str | None: + """ + Extract execution_id from a URL's query parameters. + + Args: + url: URL containing execution_id parameter + + Returns: + str | None: The execution_id if found, None otherwise + """ + try: + parsed_url = urlparse(url) + query_params = parse_qs(parsed_url.query) + execution_ids = query_params.get("execution_id") + if execution_ids: + return execution_ids[0] # Get the first value + except Exception as e: + self.logger.warning("Failed to extract execution_id from URL: %s", e) + return None + def process( self, endpoint: str, @@ -76,6 +97,13 @@ def process( data = response.json() execution_id = data.get("execution_id") + + # If execution_id is not directly available, try to extract from status_api + if not execution_id: + status_api = data.get("message", {}).get("status_api") + if status_api: + execution_id = self._extract_execution_id_from_url(status_api) + self.logger.info( "Processing started successfully. Execution ID: %s", execution_id ) @@ -119,7 +147,17 @@ def get_result(self, endpoint: str, execution_id: str) -> dict: ) response = requests.get(url, headers=self.headers, params=params) - if response.status_code != 200: + if response.status_code == 422: + # Handle 422 status which may indicate processing in progress + try: + data = response.json() + if "status" in data: + return data + except (ValueError, KeyError): + # JSON parsing failed or status key missing, treat as error + pass + raise ApiHubClientException(response.text, response.status_code) + elif response.status_code != 200: raise ApiHubClientException(response.text, response.status_code) return response.json() @@ -170,7 +208,7 @@ def wait_for_completion( ), None, ) - elif status in ["PROCESSING", "IN_PROGRESS", "RUNNING"]: + elif status in ["PROCESSING", "IN_PROGRESS", "RUNNING", "EXECUTING"]: # Continue polling pass else: diff --git a/test/test_performance.py b/test/test_performance.py deleted file mode 100644 index 2fb91f7..0000000 --- a/test/test_performance.py +++ /dev/null @@ -1,334 +0,0 @@ -"""Performance tests for ApiHubClient.""" - -from unittest.mock import mock_open, patch - -import pytest -import requests_mock - -from apihub_client.client import ApiHubClient - - -class TestApiHubClientPerformance: - """Performance tests for ApiHubClient operations.""" - - @pytest.fixture - def performance_client(self): - """Create client for performance testing.""" - return ApiHubClient( - api_key="performance_test_key", - base_url="https://api.performance.test", - ) - - @pytest.fixture - def large_mock_file_content(self): - """Create large mock file content for performance testing.""" - return b"Large PDF content for performance testing " * 10000 - - def test_extract_upload_performance( - self, performance_client, large_mock_file_content, benchmark - ): - """Test performance of file upload during extract operation.""" - with requests_mock.Mocker() as m: - m.post( - "https://api.performance.test/extract/performance_test", - json={"file_hash": "perf_hash_123", "status": "PROCESSING"}, - status_code=200, - ) - - def upload_operation(): - with patch( - "builtins.open", mock_open(read_data=large_mock_file_content) - ): - return performance_client.extract( - endpoint="performance_test", - vertical="table", - sub_vertical="performance_test", - file_path="/test/large_file.pdf", - ) - - result = benchmark(upload_operation) - assert result["file_hash"] == "perf_hash_123" - - def test_polling_performance_fast_completion( - self, performance_client, large_mock_file_content, benchmark - ): - """Test polling performance when processing completes quickly.""" - with requests_mock.Mocker() as m: - # Mock extract - m.post( - "https://api.performance.test/extract/fast_process", - json={"file_hash": "fast_hash_123", "status": "PROCESSING"}, - status_code=200, - ) - - # Mock immediate completion - m.get( - "https://api.performance.test/status?file_hash=fast_hash_123", - json={"status": "COMPLETED"}, - status_code=200, - ) - - # Mock retrieve - m.get( - "https://api.performance.test/retrieve?file_hash=fast_hash_123", - json={"result": "fast_completion_data"}, - status_code=200, - ) - - def fast_completion_workflow(): - with patch( - "builtins.open", mock_open(read_data=large_mock_file_content) - ): - with patch("time.sleep"): - return performance_client.extract( - endpoint="fast_process", - vertical="table", - sub_vertical="fast_process", - file_path="/test/file.pdf", - wait_for_completion=True, - polling_interval=0.1, - ) - - result = benchmark(fast_completion_workflow) - assert result["result"] == "fast_completion_data" - - def test_polling_performance_slow_completion( - self, performance_client, large_mock_file_content, benchmark - ): - """Test polling performance with multiple status checks.""" - with requests_mock.Mocker() as m: - # Mock extract - m.post( - "https://api.performance.test/extract/slow_process", - json={"file_hash": "slow_hash_456", "status": "PROCESSING"}, - status_code=200, - ) - - # Mock multiple processing status responses - status_responses = [] - for i in range(5): # 5 polling cycles - status_responses.append( - { - "json": {"status": "PROCESSING", "progress": i * 20}, - "status_code": 200, - } - ) - status_responses.append( - {"json": {"status": "COMPLETED", "progress": 100}, "status_code": 200} - ) - - for response in status_responses: - m.get( - "https://api.performance.test/status?file_hash=slow_hash_456", - **response, - ) - - # Mock retrieve - m.get( - "https://api.performance.test/retrieve?file_hash=slow_hash_456", - json={"result": "slow_completion_data"}, - status_code=200, - ) - - def slow_completion_workflow(): - with patch( - "builtins.open", mock_open(read_data=large_mock_file_content) - ): - with patch("time.sleep"): # Mock sleep to avoid actual delays - return performance_client.extract( - endpoint="slow_process", - vertical="table", - sub_vertical="slow_process", - file_path="/test/file.pdf", - wait_for_completion=True, - polling_interval=0.1, - ) - - result = benchmark(slow_completion_workflow) - assert result["result"] == "slow_completion_data" - - def test_multiple_sequential_requests_performance( - self, performance_client, large_mock_file_content, benchmark - ): - """Test performance of multiple sequential API requests.""" - with requests_mock.Mocker() as m: - # Mock multiple different endpoints - endpoints = ["discover", "extract", "process", "analyze"] - - for i, endpoint in enumerate(endpoints): - m.post( - f"https://api.performance.test/extract/{endpoint}", - json={"file_hash": f"hash_{i}", "status": "PROCESSING"}, - status_code=200, - ) - - m.get( - f"https://api.performance.test/status?file_hash=hash_{i}", - json={"status": "COMPLETED"}, - status_code=200, - ) - - m.get( - f"https://api.performance.test/retrieve?file_hash=hash_{i}", - json={"result": f"data_{endpoint}"}, - status_code=200, - ) - - def sequential_requests(): - results = [] - with patch( - "builtins.open", mock_open(read_data=large_mock_file_content) - ): - with patch("time.sleep"): - for endpoint in endpoints: - result = performance_client.extract( - endpoint=endpoint, - vertical="table", - sub_vertical=endpoint, - file_path="/test/file.pdf", - wait_for_completion=True, - polling_interval=0.1, - ) - results.append(result) - return results - - results = benchmark(sequential_requests) - assert len(results) == 4 - for i, result in enumerate(results): - assert result["result"] == f"data_{endpoints[i]}" - - def test_memory_usage_large_response( - self, performance_client, large_mock_file_content, benchmark - ): - """Test memory efficiency with large API responses.""" - # Create a large mock response - large_response = { - "file_hash": "large_response_hash", - "result": { - "data": ["row_" + str(i) for i in range(10000)], # Large dataset - "metadata": {"size": "large", "processing_time": 60}, - }, - } - - with requests_mock.Mocker() as m: - m.post( - "https://api.performance.test/extract/large_response", - json={"file_hash": "large_response_hash", "status": "PROCESSING"}, - status_code=200, - ) - - m.get( - "https://api.performance.test/status?file_hash=large_response_hash", - json={"status": "COMPLETED"}, - status_code=200, - ) - - m.get( - "https://api.performance.test/retrieve?file_hash=large_response_hash", - json=large_response, - status_code=200, - ) - - def large_response_workflow(): - with patch( - "builtins.open", mock_open(read_data=large_mock_file_content) - ): - with patch("time.sleep"): - return performance_client.extract( - endpoint="large_response", - vertical="table", - sub_vertical="large_response", - file_path="/test/file.pdf", - wait_for_completion=True, - polling_interval=0.1, - ) - - result = benchmark(large_response_workflow) - assert len(result["result"]["data"]) == 10000 - - def test_api_request_overhead(self, performance_client, benchmark): - """Test the overhead of API request setup and teardown.""" - with requests_mock.Mocker() as m: - m.get( - "https://api.performance.test/status?file_hash=overhead_test", - json={"status": "COMPLETED"}, - status_code=200, - ) - - def simple_status_check(): - return performance_client.get_status("overhead_test") - - result = benchmark(simple_status_check) - assert result["status"] == "COMPLETED" - - def test_concurrent_status_checks(self, performance_client, benchmark): - """Test performance of rapid consecutive status checks.""" - with requests_mock.Mocker() as m: - # Mock status endpoint - m.get( - "https://api.performance.test/status", - json={"status": "PROCESSING"}, - status_code=200, - ) - - def rapid_status_checks(): - file_hashes = [f"hash_{i}" for i in range(10)] - results = [] - for hash_id in file_hashes: - result = performance_client.get_status(hash_id) - results.append(result) - return results - - results = benchmark(rapid_status_checks) - assert len(results) == 10 - assert all(r["status"] == "PROCESSING" for r in results) - - @pytest.mark.parametrize("file_size_multiplier", [1, 10, 100]) - def test_file_size_impact_on_performance( - self, performance_client, file_size_multiplier, benchmark - ): - """Test how file size affects upload performance.""" - file_content = b"Content " * (1000 * file_size_multiplier) - - with requests_mock.Mocker() as m: - m.post( - "https://api.performance.test/extract/size_test", - json={"file_hash": f"size_hash_{file_size_multiplier}"}, - status_code=200, - ) - - def upload_sized_file(): - with patch("builtins.open", mock_open(read_data=file_content)): - return performance_client.extract( - endpoint="size_test", - vertical="table", - sub_vertical="size_test", - file_path="/test/sized_file.pdf", - ) - - result = benchmark(upload_sized_file) - assert result["file_hash"] == f"size_hash_{file_size_multiplier}" - - def test_error_handling_performance(self, performance_client, benchmark): - """Test performance impact of error handling.""" - with requests_mock.Mocker() as m: - m.post( - "https://api.performance.test/extract/error_test", - text="Internal Server Error", - status_code=500, - ) - - def error_handling_operation(): - try: - with patch("builtins.open", mock_open(read_data=b"test")): - performance_client.extract( - endpoint="error_test", - vertical="table", - sub_vertical="error_test", - file_path="/test/file.pdf", - ) - except Exception: - return "error_handled" - - result = benchmark(error_handling_operation) - assert result == "error_handled" From ab351948f579105ecd90e6c9f804329ff80f31f0 Mon Sep 17 00:00:00 2001 From: Jaseem Jas Date: Sun, 24 Aug 2025 12:37:07 +0530 Subject: [PATCH 2/2] remove: test_imports.py file as requested --- test/test_imports.py | 151 ------------------------------------------- 1 file changed, 151 deletions(-) delete mode 100644 test/test_imports.py diff --git a/test/test_imports.py b/test/test_imports.py deleted file mode 100644 index 93cc5ee..0000000 --- a/test/test_imports.py +++ /dev/null @@ -1,151 +0,0 @@ -"""Test module imports and package-level functionality.""" - - -class TestPackageImports: - """Test cases for package imports.""" - - def test_main_package_imports(self): - """Test importing main classes from the package.""" - # This should import all main classes and trigger __init__.py coverage - from apihub_client import ( - ApiHubClient, - ApiHubClientException, - DocSplitterClient, - GenericUnstractClient, - ) - - # Verify classes are importable and are actually classes - assert ApiHubClient is not None - assert ApiHubClientException is not None - assert DocSplitterClient is not None - assert GenericUnstractClient is not None - - # Verify they are actually classes/exceptions - assert callable(ApiHubClient) - assert callable(ApiHubClientException) - assert callable(DocSplitterClient) - assert callable(GenericUnstractClient) - - def test_package_metadata(self): - """Test package metadata is accessible.""" - import apihub_client - - # Check metadata attributes exist - assert hasattr(apihub_client, "__version__") - assert hasattr(apihub_client, "__author__") - assert hasattr(apihub_client, "__email__") - assert hasattr(apihub_client, "__all__") - - # Check metadata values - assert apihub_client.__version__ == "0.1.1" - assert apihub_client.__author__ == "Unstract Team" - assert apihub_client.__email__ == "support@unstract.com" - - # Check __all__ contains expected items - expected_all = [ - "ApiHubClient", - "ApiHubClientException", - "DocSplitterClient", - "GenericUnstractClient", - ] - assert apihub_client.__all__ == expected_all - - def test_direct_module_imports(self): - """Test direct module imports work.""" - from apihub_client.client import ApiHubClient, ApiHubClientException - from apihub_client.doc_splitter import DocSplitterClient - from apihub_client.generic_client import GenericUnstractClient - - # Verify classes are importable - assert ApiHubClient is not None - assert ApiHubClientException is not None - assert DocSplitterClient is not None - assert GenericUnstractClient is not None - - def test_client_instantiation(self): - """Test that clients can be instantiated from package imports.""" - from apihub_client import ( - ApiHubClient, - DocSplitterClient, - GenericUnstractClient, - ) - - # Test ApiHubClient instantiation - api_client = ApiHubClient(api_key="test_key", base_url="https://test.com") - assert api_client.api_key == "test_key" - assert api_client.base_url == "https://test.com" - - # Test DocSplitterClient instantiation - doc_client = DocSplitterClient(api_key="test_key", base_url="https://test.com") - assert doc_client.api_key == "test_key" - assert doc_client.base_url == "https://test.com" - - # Test GenericUnstractClient instantiation - generic_client = GenericUnstractClient( - api_key="test_key", base_url="https://test.com" - ) - assert generic_client.api_key == "test_key" - assert generic_client.base_url == "https://test.com" - - def test_exception_instantiation(self): - """Test that exception can be instantiated from package imports.""" - from apihub_client import ApiHubClientException - - # Test exception creation - exc = ApiHubClientException("Test message", 400) - assert exc.message == "Test message" - assert exc.status_code == 400 - - # Test exception string representation - str_repr = str(exc) - assert "Test message" in str_repr - assert "400" in str_repr - - def test_star_import(self): - """Test that star import works correctly.""" - # This imports everything in __all__ - exec("from apihub_client import *") # noqa: S102 - - # Check that the main classes are available in local scope - locals_dict = locals() - assert "ApiHubClient" in locals_dict - assert "ApiHubClientException" in locals_dict - assert "DocSplitterClient" in locals_dict - assert "GenericUnstractClient" in locals_dict - - def test_package_docstring(self): - """Test package docstring is accessible.""" - import apihub_client - - assert apihub_client.__doc__ is not None - assert "Unstract API Hub Python Client" in apihub_client.__doc__ - assert "dynamic, extensible Python client" in apihub_client.__doc__ - - def test_import_order_independence(self): - """Test that imports work regardless of order.""" - # Import in different order - from apihub_client import ( - ApiHubClient, # noqa: F401 - ApiHubClientException, # noqa: F401 - DocSplitterClient, # noqa: F401 - GenericUnstractClient, - ) - - # Should work fine - client = GenericUnstractClient(api_key="test", base_url="https://test.com") - assert client.api_key == "test" - - def test_submodule_access(self): - """Test that submodules are accessible through the package.""" - import apihub_client - - # Should be able to access submodules - assert hasattr(apihub_client, "client") - assert hasattr(apihub_client, "doc_splitter") - assert hasattr(apihub_client, "generic_client") - - # Should be able to access classes through submodules - assert hasattr(apihub_client.client, "ApiHubClient") - assert hasattr(apihub_client.client, "ApiHubClientException") - assert hasattr(apihub_client.doc_splitter, "DocSplitterClient") - assert hasattr(apihub_client.generic_client, "GenericUnstractClient")