From c76b40642254b6e31041661c224963aacc18a0d6 Mon Sep 17 00:00:00 2001
From: Jaseem Jas <jaseem@zipstack.com>
Date: Sun, 24 Aug 2025 12:12:28 +0530
Subject: [PATCH 1/2] fix: extract execution_id from status_api when not
 directly available

- Add URL parsing to extract execution_id from status_api query parameters
- Handle 422 status code responses that contain processing status
- Include EXECUTING as a valid processing status for polling
- Improve robustness of GenericUnstractClient wait_for_completion flow
- Remove test/test_performance.py (benchmark-based performance tests)
---
 src/apihub_client/generic_client.py |  42 +++-
 test/test_performance.py            | 334 ----------------------------
 2 files changed, 40 insertions(+), 336 deletions(-)
 delete mode 100644 test/test_performance.py

diff --git a/src/apihub_client/generic_client.py b/src/apihub_client/generic_client.py
index 9e89482..203d4af 100644
--- a/src/apihub_client/generic_client.py
+++ b/src/apihub_client/generic_client.py
@@ -1,5 +1,6 @@
 import logging
 import time
+from urllib.parse import parse_qs, urlparse
 
 import requests
 
@@ -32,6 +33,26 @@ def __init__(
         self.base_url = base_url.rstrip("/")
         self.headers = {"apikey": self.api_key}
 
+    def _extract_execution_id_from_url(self, url: str) -> str | None:
+        """
+        Return the first ``execution_id`` query parameter found in ``url``.
+
+        Args:
+            url: URL whose query string may carry an execution_id parameter
+
+        Returns:
+            str | None: First execution_id value; None if absent or parsing fails
+        """
+        try:
+            parsed_url = urlparse(url)
+            query_params = parse_qs(parsed_url.query)
+            execution_ids = query_params.get("execution_id")
+            if execution_ids:
+                return execution_ids[0]  # parse_qs yields a list per key; take first
+        except Exception as e:  # best-effort: log the failure, fall through to None
+            self.logger.warning("Failed to extract execution_id from URL: %s", e)
+        return None
+
     def process(
         self,
         endpoint: str,
@@ -76,6 +97,13 @@ def process(
 
         data = response.json()
         execution_id = data.get("execution_id")
+
+        # If execution_id is not directly available, try to extract from status_api
+        if not execution_id:
+            status_api = data.get("message", {}).get("status_api")
+            if status_api:
+                execution_id = self._extract_execution_id_from_url(status_api)
+
         self.logger.info(
             "Processing started successfully. Execution ID: %s", execution_id
         )
@@ -119,7 +147,17 @@ def get_result(self, endpoint: str, execution_id: str) -> dict:
         )
         response = requests.get(url, headers=self.headers, params=params)
 
-        if response.status_code != 200:
+        if response.status_code == 422:
+            # Handle 422 status which may indicate processing in progress
+            try:
+                data = response.json()
+                if "status" in data:
+                    return data
+            except (ValueError, KeyError):
+                # JSON parsing failed or status key missing, treat as error
+                pass
+            raise ApiHubClientException(response.text, response.status_code)
+        elif response.status_code != 200:
             raise ApiHubClientException(response.text, response.status_code)
 
         return response.json()
@@ -170,7 +208,7 @@ def wait_for_completion(
                     ),
                     None,
                 )
-            elif status in ["PROCESSING", "IN_PROGRESS", "RUNNING"]:
+            elif status in ["PROCESSING", "IN_PROGRESS", "RUNNING", "EXECUTING"]:
                 # Continue polling
                 pass
             else:
diff --git a/test/test_performance.py b/test/test_performance.py
deleted file mode 100644
index 2fb91f7..0000000
--- a/test/test_performance.py
+++ /dev/null
@@ -1,334 +0,0 @@
-"""Performance tests for ApiHubClient."""
-
-from unittest.mock import mock_open, patch
-
-import pytest
-import requests_mock
-
-from apihub_client.client import ApiHubClient
-
-
-class TestApiHubClientPerformance:
-    """Performance tests for ApiHubClient operations."""
-
-    @pytest.fixture
-    def performance_client(self):
-        """Create client for performance testing."""
-        return ApiHubClient(
-            api_key="performance_test_key",
-            base_url="https://api.performance.test",
-        )
-
-    @pytest.fixture
-    def large_mock_file_content(self):
-        """Create large mock file content for performance testing."""
-        return b"Large PDF content for performance testing " * 10000
-
-    def test_extract_upload_performance(
-        self, performance_client, large_mock_file_content, benchmark
-    ):
-        """Test performance of file upload during extract operation."""
-        with requests_mock.Mocker() as m:
-            m.post(
-                "https://api.performance.test/extract/performance_test",
-                json={"file_hash": "perf_hash_123", "status": "PROCESSING"},
-                status_code=200,
-            )
-
-            def upload_operation():
-                with patch(
-                    "builtins.open", mock_open(read_data=large_mock_file_content)
-                ):
-                    return performance_client.extract(
-                        endpoint="performance_test",
-                        vertical="table",
-                        sub_vertical="performance_test",
-                        file_path="/test/large_file.pdf",
-                    )
-
-            result = benchmark(upload_operation)
-            assert result["file_hash"] == "perf_hash_123"
-
-    def test_polling_performance_fast_completion(
-        self, performance_client, large_mock_file_content, benchmark
-    ):
-        """Test polling performance when processing completes quickly."""
-        with requests_mock.Mocker() as m:
-            # Mock extract
-            m.post(
-                "https://api.performance.test/extract/fast_process",
-                json={"file_hash": "fast_hash_123", "status": "PROCESSING"},
-                status_code=200,
-            )
-
-            # Mock immediate completion
-            m.get(
-                "https://api.performance.test/status?file_hash=fast_hash_123",
-                json={"status": "COMPLETED"},
-                status_code=200,
-            )
-
-            # Mock retrieve
-            m.get(
-                "https://api.performance.test/retrieve?file_hash=fast_hash_123",
-                json={"result": "fast_completion_data"},
-                status_code=200,
-            )
-
-            def fast_completion_workflow():
-                with patch(
-                    "builtins.open", mock_open(read_data=large_mock_file_content)
-                ):
-                    with patch("time.sleep"):
-                        return performance_client.extract(
-                            endpoint="fast_process",
-                            vertical="table",
-                            sub_vertical="fast_process",
-                            file_path="/test/file.pdf",
-                            wait_for_completion=True,
-                            polling_interval=0.1,
-                        )
-
-            result = benchmark(fast_completion_workflow)
-            assert result["result"] == "fast_completion_data"
-
-    def test_polling_performance_slow_completion(
-        self, performance_client, large_mock_file_content, benchmark
-    ):
-        """Test polling performance with multiple status checks."""
-        with requests_mock.Mocker() as m:
-            # Mock extract
-            m.post(
-                "https://api.performance.test/extract/slow_process",
-                json={"file_hash": "slow_hash_456", "status": "PROCESSING"},
-                status_code=200,
-            )
-
-            # Mock multiple processing status responses
-            status_responses = []
-            for i in range(5):  # 5 polling cycles
-                status_responses.append(
-                    {
-                        "json": {"status": "PROCESSING", "progress": i * 20},
-                        "status_code": 200,
-                    }
-                )
-            status_responses.append(
-                {"json": {"status": "COMPLETED", "progress": 100}, "status_code": 200}
-            )
-
-            for response in status_responses:
-                m.get(
-                    "https://api.performance.test/status?file_hash=slow_hash_456",
-                    **response,
-                )
-
-            # Mock retrieve
-            m.get(
-                "https://api.performance.test/retrieve?file_hash=slow_hash_456",
-                json={"result": "slow_completion_data"},
-                status_code=200,
-            )
-
-            def slow_completion_workflow():
-                with patch(
-                    "builtins.open", mock_open(read_data=large_mock_file_content)
-                ):
-                    with patch("time.sleep"):  # Mock sleep to avoid actual delays
-                        return performance_client.extract(
-                            endpoint="slow_process",
-                            vertical="table",
-                            sub_vertical="slow_process",
-                            file_path="/test/file.pdf",
-                            wait_for_completion=True,
-                            polling_interval=0.1,
-                        )
-
-            result = benchmark(slow_completion_workflow)
-            assert result["result"] == "slow_completion_data"
-
-    def test_multiple_sequential_requests_performance(
-        self, performance_client, large_mock_file_content, benchmark
-    ):
-        """Test performance of multiple sequential API requests."""
-        with requests_mock.Mocker() as m:
-            # Mock multiple different endpoints
-            endpoints = ["discover", "extract", "process", "analyze"]
-
-            for i, endpoint in enumerate(endpoints):
-                m.post(
-                    f"https://api.performance.test/extract/{endpoint}",
-                    json={"file_hash": f"hash_{i}", "status": "PROCESSING"},
-                    status_code=200,
-                )
-
-                m.get(
-                    f"https://api.performance.test/status?file_hash=hash_{i}",
-                    json={"status": "COMPLETED"},
-                    status_code=200,
-                )
-
-                m.get(
-                    f"https://api.performance.test/retrieve?file_hash=hash_{i}",
-                    json={"result": f"data_{endpoint}"},
-                    status_code=200,
-                )
-
-            def sequential_requests():
-                results = []
-                with patch(
-                    "builtins.open", mock_open(read_data=large_mock_file_content)
-                ):
-                    with patch("time.sleep"):
-                        for endpoint in endpoints:
-                            result = performance_client.extract(
-                                endpoint=endpoint,
-                                vertical="table",
-                                sub_vertical=endpoint,
-                                file_path="/test/file.pdf",
-                                wait_for_completion=True,
-                                polling_interval=0.1,
-                            )
-                            results.append(result)
-                return results
-
-            results = benchmark(sequential_requests)
-            assert len(results) == 4
-            for i, result in enumerate(results):
-                assert result["result"] == f"data_{endpoints[i]}"
-
-    def test_memory_usage_large_response(
-        self, performance_client, large_mock_file_content, benchmark
-    ):
-        """Test memory efficiency with large API responses."""
-        # Create a large mock response
-        large_response = {
-            "file_hash": "large_response_hash",
-            "result": {
-                "data": ["row_" + str(i) for i in range(10000)],  # Large dataset
-                "metadata": {"size": "large", "processing_time": 60},
-            },
-        }
-
-        with requests_mock.Mocker() as m:
-            m.post(
-                "https://api.performance.test/extract/large_response",
-                json={"file_hash": "large_response_hash", "status": "PROCESSING"},
-                status_code=200,
-            )
-
-            m.get(
-                "https://api.performance.test/status?file_hash=large_response_hash",
-                json={"status": "COMPLETED"},
-                status_code=200,
-            )
-
-            m.get(
-                "https://api.performance.test/retrieve?file_hash=large_response_hash",
-                json=large_response,
-                status_code=200,
-            )
-
-            def large_response_workflow():
-                with patch(
-                    "builtins.open", mock_open(read_data=large_mock_file_content)
-                ):
-                    with patch("time.sleep"):
-                        return performance_client.extract(
-                            endpoint="large_response",
-                            vertical="table",
-                            sub_vertical="large_response",
-                            file_path="/test/file.pdf",
-                            wait_for_completion=True,
-                            polling_interval=0.1,
-                        )
-
-            result = benchmark(large_response_workflow)
-            assert len(result["result"]["data"]) == 10000
-
-    def test_api_request_overhead(self, performance_client, benchmark):
-        """Test the overhead of API request setup and teardown."""
-        with requests_mock.Mocker() as m:
-            m.get(
-                "https://api.performance.test/status?file_hash=overhead_test",
-                json={"status": "COMPLETED"},
-                status_code=200,
-            )
-
-            def simple_status_check():
-                return performance_client.get_status("overhead_test")
-
-            result = benchmark(simple_status_check)
-            assert result["status"] == "COMPLETED"
-
-    def test_concurrent_status_checks(self, performance_client, benchmark):
-        """Test performance of rapid consecutive status checks."""
-        with requests_mock.Mocker() as m:
-            # Mock status endpoint
-            m.get(
-                "https://api.performance.test/status",
-                json={"status": "PROCESSING"},
-                status_code=200,
-            )
-
-            def rapid_status_checks():
-                file_hashes = [f"hash_{i}" for i in range(10)]
-                results = []
-                for hash_id in file_hashes:
-                    result = performance_client.get_status(hash_id)
-                    results.append(result)
-                return results
-
-            results = benchmark(rapid_status_checks)
-            assert len(results) == 10
-            assert all(r["status"] == "PROCESSING" for r in results)
-
-    @pytest.mark.parametrize("file_size_multiplier", [1, 10, 100])
-    def test_file_size_impact_on_performance(
-        self, performance_client, file_size_multiplier, benchmark
-    ):
-        """Test how file size affects upload performance."""
-        file_content = b"Content " * (1000 * file_size_multiplier)
-
-        with requests_mock.Mocker() as m:
-            m.post(
-                "https://api.performance.test/extract/size_test",
-                json={"file_hash": f"size_hash_{file_size_multiplier}"},
-                status_code=200,
-            )
-
-            def upload_sized_file():
-                with patch("builtins.open", mock_open(read_data=file_content)):
-                    return performance_client.extract(
-                        endpoint="size_test",
-                        vertical="table",
-                        sub_vertical="size_test",
-                        file_path="/test/sized_file.pdf",
-                    )
-
-            result = benchmark(upload_sized_file)
-            assert result["file_hash"] == f"size_hash_{file_size_multiplier}"
-
-    def test_error_handling_performance(self, performance_client, benchmark):
-        """Test performance impact of error handling."""
-        with requests_mock.Mocker() as m:
-            m.post(
-                "https://api.performance.test/extract/error_test",
-                text="Internal Server Error",
-                status_code=500,
-            )
-
-            def error_handling_operation():
-                try:
-                    with patch("builtins.open", mock_open(read_data=b"test")):
-                        performance_client.extract(
-                            endpoint="error_test",
-                            vertical="table",
-                            sub_vertical="error_test",
-                            file_path="/test/file.pdf",
-                        )
-                except Exception:
-                    return "error_handled"
-
-            result = benchmark(error_handling_operation)
-            assert result == "error_handled"

From ab351948f579105ecd90e6c9f804329ff80f31f0 Mon Sep 17 00:00:00 2001
From: Jaseem Jas <jaseem@zipstack.com>
Date: Sun, 24 Aug 2025 12:37:07 +0530
Subject: [PATCH 2/2] remove: test_imports.py file as requested

---
 test/test_imports.py | 151 -------------------------------------------
 1 file changed, 151 deletions(-)
 delete mode 100644 test/test_imports.py

diff --git a/test/test_imports.py b/test/test_imports.py
deleted file mode 100644
index 93cc5ee..0000000
--- a/test/test_imports.py
+++ /dev/null
@@ -1,151 +0,0 @@
-"""Test module imports and package-level functionality."""
-
-
-class TestPackageImports:
-    """Test cases for package imports."""
-
-    def test_main_package_imports(self):
-        """Test importing main classes from the package."""
-        # This should import all main classes and trigger __init__.py coverage
-        from apihub_client import (
-            ApiHubClient,
-            ApiHubClientException,
-            DocSplitterClient,
-            GenericUnstractClient,
-        )
-
-        # Verify classes are importable and are actually classes
-        assert ApiHubClient is not None
-        assert ApiHubClientException is not None
-        assert DocSplitterClient is not None
-        assert GenericUnstractClient is not None
-
-        # Verify they are actually classes/exceptions
-        assert callable(ApiHubClient)
-        assert callable(ApiHubClientException)
-        assert callable(DocSplitterClient)
-        assert callable(GenericUnstractClient)
-
-    def test_package_metadata(self):
-        """Test package metadata is accessible."""
-        import apihub_client
-
-        # Check metadata attributes exist
-        assert hasattr(apihub_client, "__version__")
-        assert hasattr(apihub_client, "__author__")
-        assert hasattr(apihub_client, "__email__")
-        assert hasattr(apihub_client, "__all__")
-
-        # Check metadata values
-        assert apihub_client.__version__ == "0.1.1"
-        assert apihub_client.__author__ == "Unstract Team"
-        assert apihub_client.__email__ == "support@unstract.com"
-
-        # Check __all__ contains expected items
-        expected_all = [
-            "ApiHubClient",
-            "ApiHubClientException",
-            "DocSplitterClient",
-            "GenericUnstractClient",
-        ]
-        assert apihub_client.__all__ == expected_all
-
-    def test_direct_module_imports(self):
-        """Test direct module imports work."""
-        from apihub_client.client import ApiHubClient, ApiHubClientException
-        from apihub_client.doc_splitter import DocSplitterClient
-        from apihub_client.generic_client import GenericUnstractClient
-
-        # Verify classes are importable
-        assert ApiHubClient is not None
-        assert ApiHubClientException is not None
-        assert DocSplitterClient is not None
-        assert GenericUnstractClient is not None
-
-    def test_client_instantiation(self):
-        """Test that clients can be instantiated from package imports."""
-        from apihub_client import (
-            ApiHubClient,
-            DocSplitterClient,
-            GenericUnstractClient,
-        )
-
-        # Test ApiHubClient instantiation
-        api_client = ApiHubClient(api_key="test_key", base_url="https://test.com")
-        assert api_client.api_key == "test_key"
-        assert api_client.base_url == "https://test.com"
-
-        # Test DocSplitterClient instantiation
-        doc_client = DocSplitterClient(api_key="test_key", base_url="https://test.com")
-        assert doc_client.api_key == "test_key"
-        assert doc_client.base_url == "https://test.com"
-
-        # Test GenericUnstractClient instantiation
-        generic_client = GenericUnstractClient(
-            api_key="test_key", base_url="https://test.com"
-        )
-        assert generic_client.api_key == "test_key"
-        assert generic_client.base_url == "https://test.com"
-
-    def test_exception_instantiation(self):
-        """Test that exception can be instantiated from package imports."""
-        from apihub_client import ApiHubClientException
-
-        # Test exception creation
-        exc = ApiHubClientException("Test message", 400)
-        assert exc.message == "Test message"
-        assert exc.status_code == 400
-
-        # Test exception string representation
-        str_repr = str(exc)
-        assert "Test message" in str_repr
-        assert "400" in str_repr
-
-    def test_star_import(self):
-        """Test that star import works correctly."""
-        # This imports everything in __all__
-        exec("from apihub_client import *")  # noqa: S102
-
-        # Check that the main classes are available in local scope
-        locals_dict = locals()
-        assert "ApiHubClient" in locals_dict
-        assert "ApiHubClientException" in locals_dict
-        assert "DocSplitterClient" in locals_dict
-        assert "GenericUnstractClient" in locals_dict
-
-    def test_package_docstring(self):
-        """Test package docstring is accessible."""
-        import apihub_client
-
-        assert apihub_client.__doc__ is not None
-        assert "Unstract API Hub Python Client" in apihub_client.__doc__
-        assert "dynamic, extensible Python client" in apihub_client.__doc__
-
-    def test_import_order_independence(self):
-        """Test that imports work regardless of order."""
-        # Import in different order
-        from apihub_client import (
-            ApiHubClient,  # noqa: F401
-            ApiHubClientException,  # noqa: F401
-            DocSplitterClient,  # noqa: F401
-            GenericUnstractClient,
-        )
-
-        # Should work fine
-        client = GenericUnstractClient(api_key="test", base_url="https://test.com")
-        assert client.api_key == "test"
-
-    def test_submodule_access(self):
-        """Test that submodules are accessible through the package."""
-        import apihub_client
-
-        # Should be able to access submodules
-        assert hasattr(apihub_client, "client")
-        assert hasattr(apihub_client, "doc_splitter")
-        assert hasattr(apihub_client, "generic_client")
-
-        # Should be able to access classes through submodules
-        assert hasattr(apihub_client.client, "ApiHubClient")
-        assert hasattr(apihub_client.client, "ApiHubClientException")
-        assert hasattr(apihub_client.doc_splitter, "DocSplitterClient")
-        assert hasattr(apihub_client.generic_client, "GenericUnstractClient")